# Snippet metadata (from paste site): saved May 08, 2019, 07:20 PM
1import urllib.request
2import json
3import re
4import sqlite3
5from datetime import datetime
6from pymystem3 import Mystem
7import matplotlib.pyplot as plt
8import operator
9from wordcloud import WordCloud
10
11
token = ''  # VK Service token — must be filled in before running
owner_id = -84793390  # wall owner; negative id = community ("Yandex Academy")

# URL-building pieces for VK API calls (method prefix + query templates, v5.92)
a = 'https://api.vk.com/method/'
b = 'wall.get?'
b2 = 'wall.getComments?'
c = 'owner_id=%i&offset=%i&count=%i&v=5.92&access_token=%s'
c2 = 'post_id=%i&owner_id=%i&offset=%i&count=%i&v=5.92&access_token=%s'

# Module-level SQLite connection/cursor shared by every function below
conn = sqlite3.connect('hw6_data.db')
c1 = conn.cursor()
23
24
def get_stop_words_from_file(file_local):
    """Read a newline-separated stop-word list from *file_local*.

    Returns a list of words with line terminators stripped.
    """
    # 'with' guarantees the handle is closed (original leaked it);
    # splitlines() strips '\n' exactly like the old x.split('\n')[0],
    # and also copes with a missing trailing newline on the last line.
    with open(file_local, 'r', encoding='utf-8') as f:
        return f.read().splitlines()
33
34
def get_sex_by_user_id(user_id):
    """Query VK users.get for the 'sex' field of *user_id*.

    Returns 1 (female), 2 (male), 0 (hidden), or -1 when the id is a
    community (<= 0) or VK omits the field entirely.
    """
    if user_id <= 0:
        # Non-positive ids denote communities, which have no sex field.
        return -1
    url = ('https://api.vk.com/method/users.get?'
           'v=%s&access_token=%s&user_ids=%s&fields=sex' % (5.92, token, user_id))
    raw = urllib.request.urlopen(urllib.request.Request(url)).read()
    user = json.loads(raw.decode('utf-8'))['response'][0]
    return user['sex'] if 'sex' in user else -1
50
51
def get_is_private_name_sex_by_user_id(user_id):
    """Fetch privacy flag, full name and sex for *user_id* via users.get.

    Returns a tuple (is_private, "First Last", sex); sex is -1 when VK
    omits it, is_private is 0 for deactivated/open accounts and 1 for
    closed profiles.
    """
    url = ('https://api.vk.com/method/users.get?'
           'v=%s&access_token=%s&user_ids=%s&fields=sex' % (5.92, token, user_id))
    raw = urllib.request.urlopen(urllib.request.Request(url)).read()
    user = json.loads(raw.decode('utf-8'))['response'][0]

    if 'deactivated' in user:
        # Deactivated accounts are treated as open profiles.
        is_private = 0
    elif user['is_closed'] is True:
        is_private = 1
    else:
        is_private = 0

    name = '%s %s' % (user['first_name'], user['last_name'])
    sex = user.get('sex', -1)
    return is_private, name, sex
72
73
def get_number_of_friends_by_id(user_id):
    """Return the friend count for *user_id* via friends.get, or -1.

    -1 covers community ids and any API error (closed profile,
    deactivated account, bad token, ...).
    """
    if user_id < 0:
        return -1
    url = ('https://api.vk.com/method/friends.get?'
           'v=%s&access_token=%s&user_id=%s' % (5.92, token, user_id))
    raw = urllib.request.urlopen(urllib.request.Request(url)).read()
    data = json.loads(raw.decode('utf-8'))
    # VK signals an error by omitting the 'response' key.
    if 'response' not in data:
        return -1
    return data['response']['count']
88
89
def get_posts(how_many):
    """Download up to *how_many* posts from the wall of `owner_id`.

    Pages through wall.get in batches of 100 and returns a dict mapping
    a running 0-based index to the raw post item from the API.  Stops
    early (and skips the partial-page request) when the wall runs out of
    posts before *how_many* are collected.
    """
    count = 0                      # number of full pages fetched so far
    r = how_many % 100             # size of the trailing partial page
    local_posts = {}
    i = 0
    while i < how_many//100:
        p1 = a + b + c             # wall.get URL template
        req = urllib.request.Request(p1 % (owner_id, 100*i, 100, token))
        response = urllib.request.urlopen(req)
        result = response.read().decode('utf-8')
        data1 = json.loads(result)
        data_length = len(data1['response']['items'])

        for j in range(min(100, data_length)):
            local_posts[j+100*i] = data1['response']['items'][j]

        count = i + 1
        i = i + 1
        # Short page means the wall is exhausted: force loop exit and
        # cancel the partial-page request below.
        if data_length < 100:
            i = how_many
            r = 0

    if r > 0:
        # Fetch the final partial page (offset = full pages * 100).
        p2 = a + b + c
        req = urllib.request.Request(p2 % (owner_id, 100*count, r, token))
        response = urllib.request.urlopen(req)
        result = response.read().decode('utf-8')
        data1 = json.loads(result)

        for i in range(r):
            local_posts[i+100*count] = data1['response']['items'][i]

    return local_posts
123
124
def get_thread_comments(local_comments, main_comment, offset):
    """Fetch every reply in *main_comment*'s thread into *local_comments*.

    Pages through wall.getComments with comment_id in batches of 100.
    Deleted replies are stored with user_id -1 and a placeholder text.
    Dict keys are assigned consecutively starting at *offset*.
    Returns the updated dict.
    """
    c_id = main_comment["id"]
    count_thread = main_comment["thread"]["count"]
    deleted_text = "This comment has been deleted"

    url = ('https://api.vk.com/method/wall.getComments?'
           'comment_id=%i&owner_id=%i&offset=%i&count=%i&v=5.92&access_token=%s')

    count = 0  # full pages fetched so far
    for i in range(count_thread//100):
        req = urllib.request.Request(url % (c_id, owner_id, 100*count, 100, token))
        data = json.loads(urllib.request.urlopen(req).read().decode('utf-8'))

        for j in range(100):
            item = data['response']['items'][j]
            # BUG FIX: original keyed entries with offset+i*count+j; since
            # count == i while page i is processed, that yields offset+i*i+j
            # and keys collide/skip.  Pages are 100 comments wide.
            key = offset + 100*i + j
            if "deleted" in item:
                local_comments[key] = {"text": deleted_text, "user_id": -1,
                                       'id': item["id"]}
            else:
                local_comments[key] = {"text": item["text"],
                                       "user_id": item["from_id"],
                                       'id': item["id"]}
        count += 1

    # Trailing partial page (the request is issued even when rg == 0,
    # matching the original control flow).
    rg = count_thread % 100
    # BUG FIX: original wrote `p4 + p5 % (...)` — precedence applied % to
    # p5 alone (4 placeholders, 5 arguments -> TypeError at runtime).
    # The full URL must be formatted as one string.
    req = urllib.request.Request(url % (c_id, owner_id, 100*count, rg, token))
    data = json.loads(urllib.request.urlopen(req).read().decode('utf-8'))
    for j in range(rg):
        item = data['response']['items'][j]
        key = offset + 100*count + j
        # BUG FIX: original read item["text"]/["from_id"] before checking
        # "deleted", raising KeyError on deleted replies in this loop.
        if "deleted" in item:
            local_comments[key] = {"text": deleted_text, "user_id": -1,
                                   'id': item["id"]}
        else:
            local_comments[key] = {"text": item["text"],
                                   "user_id": item["from_id"],
                                   'id': item["id"]}

    return local_comments
173
174
def get_comments(post_local):
    """Download all top-level comments (plus their reply threads) of a post.

    Pages through wall.getComments in batches of 100; each comment is
    stored as {'text', 'user_id', 'id'} under a running integer key.
    Deleted comments get user_id -1 and a placeholder text.  Finally the
    VK mention prefix '[idNNN|' is stripped from every stored text.
    """
    count = 0
    local_comments = {}
    post_id = post_local["id"]
    number_of_comments = post_local['comments']['count']
    r = number_of_comments % 100       # size of the trailing partial page

    for i in range(number_of_comments//100):
        p1 = a + b2 + c2               # wall.getComments URL template
        l = 100*i
        req = urllib.request.Request(p1 % (post_id, owner_id, l, 100, token))
        response = urllib.request.urlopen(req)
        result = response.read().decode('utf-8')
        data2 = json.loads(result)

        for j in range(100):
            s1 = "text"
            s2 = "This comment has been deleted"
            s3 = "user_id"
            s4 = data2['response']['items'][j]["id"]
            if "deleted" in data2['response']['items'][j]:
                local_comments[j+100*i] = {s1: s2, s3: -1, 'id': s4}
            else:
                s5 = data2['response']['items'][j]["text"]
                s6 = data2['response']['items'][j]["from_id"]
                local_comments[j+100*i] = {s1: s5, s3: s6, 'id': s4}

            # Recurse into the reply thread, inserting replies right
            # after this comment's key.
            n = data2["response"]['items'][j]["thread"]["count"]
            if n > 0:
                s7 = local_comments
                s8 = data2["response"]['items'][j]
                local_comments = get_thread_comments(s7, s8, j+100*i + 1)
                # NOTE(review): this `j += n` is a no-op — `for` rebinds
                # j on the next iteration, so thread replies and later
                # comments can share keys.  Verify intended numbering.
                j += n

        count = i + 1

    if r > 0:
        # Final partial page; k indexes API items, j tracks output keys
        # (j advances past thread replies, k does not).
        j1 = a + b2 + c2
        i1 = 100*count
        req = urllib.request.Request(j1 % (post_id, owner_id, i1, r, token))
        response = urllib.request.urlopen(req)
        result = response.read().decode('utf-8')
        data2 = json.loads(result)
        k, j = 0, 0

        while j < r:
            s1 = "text"
            s2 = "This comment has been deleted"
            s3 = "user_id"
            s4 = data2['response']['items'][k]["id"]
            if "deleted" in data2['response']['items'][k]:
                local_comments[100*count + j] = {s1: s2, s3: -1, 'id': s4}
            else:
                s5 = data2['response']['items'][k]["text"]
                s6 = data2['response']['items'][k]["from_id"]
                local_comments[100*count + j] = {s1: s5, s3: s6, 'id': s4}

            n = data2["response"]['items'][k]["thread"]["count"]
            if n > 0:
                u1 = data2["response"]['items'][k]
                e1 = 100*count + j + 1
                local_comments = get_thread_comments(local_comments, u1, e1)
                j += n
            k += 1
            j += 1

    # Strip VK user-mention markup ("[id123|Name]" prefix) from all texts.
    for k in range(len(local_comments)):
        t111 = local_comments[k]['text']
        local_comments[k]['text'] = re.sub(r'\[id\d+\|', '', t111)

    return local_comments
246
247
def get_post_or_comment_length(text_local):
    """Count word tokens (regex ``\\w+``) in a post/comment text.

    The deleted-comment placeholder counts as length 0.
    """
    if text_local == "This comment has been deleted":
        return 0
    # Raw string avoids the invalid-escape warning of '\w+'; re.findall
    # uses the module's pattern cache, so no explicit compile is needed.
    return len(re.findall(r'\w+', text_local))
253
254
def get_average_comments_lengths(local_comments):
    """Average word count of comment rows, skipping deleted ones.

    *local_comments* is a sequence of DB rows where index 3 holds the
    comment text.  Returns 0 when every comment is deleted or the
    sequence is empty.
    """
    word_re = re.compile(r'\w+')       # hoisted: compile once, not per row
    total_words = 0
    counted = 0
    for row in local_comments:
        text = row[3]
        if text != "This comment has been deleted":
            total_words += len(word_re.findall(text))
            counted += 1

    return total_words / counted if counted > 0 else 0
270
271
def get_time_of_post_in_hours(unix_time):
    """Convert a unix timestamp to hours since local midnight (float)."""
    moment = datetime.fromtimestamp(unix_time)
    seconds_into_day = moment.hour * 3600 + moment.minute * 60 + moment.second
    return seconds_into_day / 3600
276
277
def get_post_weekday(post_local):
    """Return the weekday of the post's 'date' field: 1 = Monday ... 7 = Sunday."""
    moment = datetime.fromtimestamp(post_local['date'])
    return moment.weekday() + 1
283
284
def get_data_from_file():
    """Return the full contents of data_unlemmatized.txt as one string."""
    # Context manager closes the handle even if read() raises.
    with open("data_unlemmatized.txt", "r", encoding='utf-8') as fh:
        return fh.read()
292
293
def save_lemmatized_data(texts_local):
    """Lemmatize *texts_local* with Mystem and write data_lemmatized.txt.

    Input lines are accumulated into batches of ~11 lines before each
    lemmatize call (mirrors the original batching to keep per-call text
    small), then the final partial batch is flushed.
    """
    m = Mystem()
    # One 'w' open both truncates and writes — replaces the original
    # truncate-then-reopen-for-append dance, and 'with' closes the handle.
    with open("data_lemmatized.txt", "w", encoding='utf-8') as out:
        batch = ''
        count = 0
        for line_local in texts_local.split('\n'):
            batch = batch + line_local + ' '
            if count > 10:
                out.write(''.join(m.lemmatize(batch)))
                batch = ''
                count = 0
            count += 1

        # Flush whatever is left in the final partial batch.
        out.write(''.join(m.lemmatize(batch)))
315
316
def save_posts_data_to_database(posts_local):
    """Insert every post into the 'posts' table and rewrite the text dumps.

    Truncates data_lemmatized.txt and data_unlemmatized.txt, writes each
    post's text to data_unlemmatized.txt, and stores a row
    (post_id, owner_id, weekday, comment_count, text, date) in SQLite.
    """
    # Truncate the lemmatized dump; save_lemmatized_data rebuilds it later.
    with open("data_lemmatized.txt", "w", encoding='utf-8'):
        pass

    insert_sql = 'INSERT INTO posts VALUES (?, ?, ?, ?, ?, ?)'
    # 'w' truncates and writes with a single handle (the original opened
    # the file twice and never used a context manager).
    with open('data_unlemmatized.txt', 'w', encoding='utf-8') as dump:
        for i in range(len(posts_local)):
            post = posts_local[i]
            dump.write(post['text'] + '\n')
            c1.execute(insert_sql, (post['id'], post['owner_id'],
                                    get_post_weekday(post),
                                    post['comments']['count'],
                                    post['text'], post['date']))
            conn.commit()
341
342
def save_comments_data_to_database(posts_local):
    """Fetch comments for every post and persist them (and their authors).

    Appends each comment text to data_unlemmatized.txt, inserts a row
    (comment_id, user_id, post_id, text) into 'comments', and records
    real users (user_id > 0) via save_users_data_to_database.
    """
    insert_sql = 'INSERT INTO comments VALUES (?, ?, ?, ?)'
    # 'with' guarantees the dump file is closed even if the API errors out.
    with open("data_unlemmatized.txt", "a", encoding='utf-8') as dump:
        for i in range(len(posts_local)):
            post_id = posts_local[i]['id']
            comments_local = get_comments(posts_local[i])
            for j in range(len(comments_local)):
                comment = comments_local[j]
                dump.write(comment['text'] + '\n')
                c1.execute(insert_sql, (comment['id'], comment['user_id'],
                                        post_id, comment['text']))
                conn.commit()
                if comment['user_id'] > 0:
                    save_users_data_to_database(comment['user_id'])
362
def save_users_data_to_database(user_id):
    """Insert profile data for *user_id* into 'users' unless already stored.

    Stores (user_id, is_private, name, sex, number_of_friends); community
    ids (user_id <= 0) are ignored.
    """
    # Parameterized query instead of string concatenation — standard
    # SQL-injection hygiene, even though user_id is an int here.
    c1.execute('SELECT * FROM users WHERE user_id = ?', (user_id,))
    if c1.fetchone() is None and user_id > 0:
        is_private, name, sex = get_is_private_name_sex_by_user_id(user_id)
        nf = get_number_of_friends_by_id(user_id)
        c1.execute('INSERT INTO users VALUES (?, ?, ?, ?, ?)',
                   (user_id, is_private, name, sex, nf))
        conn.commit()
371
372# ################### Graphs ###################
373
374
def graph_1_posts_and_comments_lengths(posts_local):
    """Scatter plot: average comment length vs. post length (in words).

    *posts_local* is a list of 'posts' table rows; index 0 is post_id,
    index 4 is the post text.  Comment rows are pulled from SQLite.
    """
    len_posts = len(posts_local)
    # Placeholder list; every slot is reassigned below, so the shared-{}
    # aliasing from list multiplication is harmless here.
    data_x_y = [{}]*len_posts
    s1 = 'SELECT * FROM comments WHERE post_id = '
    for h in range(len_posts):
        c1.execute(s1 + str(posts_local[h][0]))
        comments_local = c1.fetchall()
        x = get_post_or_comment_length(posts_local[h][4])
        y = get_average_comments_lengths(comments_local)
        data_x_y[h] = {'x': x, 'y': y}

    # Sort points by post length so the scatter reads left-to-right.
    data_x_y = sorted(data_x_y, key=lambda k: k['x'])
    X = [0]*len_posts
    Y = [0]*len_posts

    for i in range(len_posts):
        X[i] = data_x_y[i]['x']
        Y[i] = data_x_y[i]['y']

    plt.scatter(X, Y, s=30, c="#ceaefa", marker="o")
    t1 = "График завиÑимоÑти Ñредней длины комментариев от длины поÑта"
    plt.title(t1)
    plt.xlabel("Длина поÑта, Ñлова")
    plt.ylabel("СреднÑÑ Ð´Ð»Ð¸Ð½Ð° комментариÑ, Ñлова")
    plt.xlim(-3, 300)
    plt.ylim(-0.5, 50)
    plt.show()
403
404
def graph_2_post_time_and_length(posts_local):
    """Scatter plot: average post length (words) per hour of publication.

    Posts are binned into 24 hour-buckets by local publication time;
    buckets with no posts are skipped on the x-axis.
    """
    len_posts = len(posts_local)
    # data_x_y[hour] = [total words, post count]
    data_x_y = list([])
    for h in range(24):
        data_x_y.append([0, 0])

    s1 = 'SELECT date FROM posts WHERE post_id = '
    for h in range(len_posts):
        c1.execute(s1 + str(posts_local[h][0]))
        date_local = c1.fetchall()[0][0]
        post_time = get_time_of_post_in_hours(date_local)
        post_length = get_post_or_comment_length(posts_local[h][4])
        decision_hat = int(post_time//1)      # hour bucket 0..23
        data_x_y[decision_hat][0] += post_length
        data_x_y[decision_hat][1] += 1

    X = []
    Y = []
    l_time = list()

    count_local = 0
    for i in range(24):
        g = i
        if i == 23:
            g = -1               # wrap the 23:01-24:00 label to ...-0:00
        y_count = data_x_y[i][1]
        if y_count != 0:         # only plot hours that have posts
            X.append(count_local+1)
            Y.append(data_x_y[i][0]//y_count)
            l_time.append('%d:01-%d:00' % (i, g + 1))
            count_local += 1

    t1 = "График завиÑимоÑти Ñредней длины поÑта от времени публикации поÑта"
    plt.scatter(X, Y, s=60, c="orange", marker="o")
    plt.title(t1)
    plt.xlabel("Ð’Ñ€ÐµÐ¼Ñ Ð¿ÑƒÐ±Ð»Ð¸ÐºÐ°Ñ†Ð¸Ð¸ поÑта, чаÑÑ‹")
    plt.ylabel("СреднÑÑ Ð´Ð»Ð¸Ð½Ð° поÑта, Ñлова")
    plt.subplots_adjust(bottom=0.3)
    plt.xticks(X, l_time, rotation=45)
    plt.xlim(0, count_local + 1)
    plt.ylim(-3, 150)
    plt.show()
448
449
def graph_3_post_weekday_and_length(posts_local):
    """Line plot: average post length (words) per weekday of publication.

    *posts_local* rows carry the weekday at index 2 (1 = Monday) and the
    post text at index 4.
    """
    # sums[d] = [total words, post count] for weekday d+1
    sums = [[0, 0] for _ in range(7)]
    for row in posts_local:
        weekday = row[2]
        sums[weekday-1][0] += get_post_or_comment_length(row[4])
        sums[weekday-1][1] += 1

    X = list(range(1, 8))
    Y = []
    for total, n_posts in sums:
        # Avoid dividing by zero for weekdays without posts.
        Y.append(total / max(n_posts, 1))

    plt.plot(X, Y, c="#ff6163")
    plt.title("График завиÑимоÑти Ñредней длины поÑта от Ð´Ð½Ñ Ð½ÐµÐ´ÐµÐ»Ð¸ публикации")
    plt.xlabel("День недели публикации поÑта")
    plt.ylabel("Длина поÑта, Ñлова")
    days_list = ['Понедельник', 'Вторник', 'Среда', 'Четверг', 'ПÑтница',
                 'Суббота', 'ВоÑкреÑенье']
    plt.xticks(X, days_list, rotation=45)
    plt.subplots_adjust(bottom=0.3)
    plt.xlim(0.5, 7.5)
    plt.ylim(0, 110)
    plt.show()
482
483
def graph_4_sex_and_comments_length(posts_local):
    """Bar chart: average comment length (words) by commenter sex.

    Looks up each comment author's sex in the 'users' table; comments by
    communities/deleted authors (user_id < 1) or users whose sex VK hides
    (sex <= 0) are skipped.
    """
    male_sum = 0
    male_count = 0
    female_sum = 0
    female_count = 0

    for h in range(len(posts_local)):
        # Parameterized queries instead of string concatenation.
        c1.execute('SELECT * FROM comments WHERE post_id = ?',
                   (posts_local[h][0],))
        comments_local = c1.fetchall()
        for comment in comments_local:
            user_id = comment[1]
            if user_id < 1:
                continue
            length = get_post_or_comment_length(comment[3])
            c1.execute('SELECT sex FROM users WHERE user_id = ?', (user_id,))
            sex = c1.fetchone()[0]
            if sex == 2:
                male_sum += length
                male_count += 1
            elif sex > 0:
                female_sum += length
                female_count += 1

    # BUG FIX: original divided unconditionally and raised
    # ZeroDivisionError when one sex had no counted comments.
    X = [1, 2]
    Y = [female_sum/female_count if female_count else 0,
         male_sum/male_count if male_count else 0]
    plt.bar(X[0], Y[0], color='#9d0759')
    plt.bar(X[1], Y[1], color='#3a18b1')
    plt.xticks(X, ['ЖенÑкий', 'МужÑкой'], rotation=0)
    t0 = "График завиÑимоÑти Ñредней длины комментариев"
    t1 = t0 + " от пола автора комментариÑ"
    plt.title(t1)
    plt.xlabel("Пол")
    plt.ylabel("СреднÑÑ Ð´Ð»Ð¸Ð½Ð° комментариÑ, Ñлова")
    plt.subplots_adjust(bottom=0.2)
    plt.xlim(0, 3)
    plt.ylim(0, 25)
    plt.show()
527
528
def graph_5_number_of_friends_and_comments_length(posts_local):
    """Line plot: average comment length vs. author's friend count.

    Friend counts are binned in widths of 30 (bin 10 collects >300);
    comments by communities (user_id < 1) or users whose friend count is
    unknown (< 0) are skipped.
    """
    len_posts = len(posts_local)
    z1 = []
    for uu in range(11):
        z1.append([0, 0])
    data_x_y = list(z1)  # data_x_y[bin] = [word sum, comment count]
    s1 = 'SELECT number_of_friends FROM users WHERE user_id = '
    for h in range(len_posts):
        post_id = posts_local[h][0]
        c1.execute('SELECT * FROM comments WHERE post_id = ' + str(post_id))
        comments_local = c1.fetchall()
        for i in range(len(comments_local)):
            user_id = comments_local[i][1]
            if user_id < 1:
                continue
            c1.execute(s1 + str(user_id))
            number_of_friends = c1.fetchone()[0]
            if number_of_friends < 0:
                continue
            comment_length = get_post_or_comment_length(comments_local[i][3])
            # NOTE(review): always true after the `< 0` continue above —
            # redundant guard kept as-is.
            if number_of_friends > -1:
                decision_hat = number_of_friends//30   # 30-wide bins
                if decision_hat > 10:
                    decision_hat = 10                  # clamp to '>300'
                data_x_y[decision_hat][0] += comment_length
                data_x_y[decision_hat][1] += 1

    length_data = len(data_x_y)
    X = []
    Y = []

    for i in range(length_data):
        X.append(i+1)
        if data_x_y[i][1] == 0:
            data_x_y[i][1] = 1     # avoid division by zero for empty bins
        Y.append(data_x_y[i][0]/data_x_y[i][1])

    plt.plot(X, Y, c='#fed0fc', linewidth=3)
    t1 = "График завиÑимоÑти Ñредней длины комментариев от количеÑтва друзей"
    plt.title(t1)
    list_hours = list(['0-30', '31-60', '61-90', '91-120', '121-150'])
    list_hours.append('151-180')
    list_hours.append('181-210')
    list_hours.append('211-240')
    list_hours.append('241-270')
    list_hours.append('271-300')
    list_hours.append('>300')

    plt.xticks(X, list_hours, rotation=45)
    plt.xlim(0.5, 11.5)
    plt.ylim(0, 25)
    plt.xlabel("КоличеÑтво друзей")
    plt.ylabel("Длина комментариÑ, Ñлова")
    plt.subplots_adjust(bottom=0.2)
    plt.show()
585
586
def graph_6_top_30_words_unlemmatized():
    """Bar chart of the 30 most frequent words in data_unlemmatized.txt.

    Text is lowercased, tokenized with ``\\w+`` and filtered against the
    stop-word list in stop_ru.txt.
    """
    with open("data_unlemmatized.txt", "r", encoding='utf-8') as fh:
        texts_local = fh.read()
    # A set gives O(1) membership tests; the original scanned a list
    # once per word.
    stop_words = set(get_stop_words_from_file('stop_ru.txt'))

    words_count = {}
    for word in re.findall(r'\w+', texts_local.lower()):
        if word not in stop_words:
            words_count[word] = words_count.get(word, 0) + 1

    ranked = sorted(words_count.items(), key=operator.itemgetter(1),
                    reverse=True)
    words_top = []
    X = []
    Y = []

    for m in range(min(30, len(ranked))):
        words_top.append(ranked[m][0])
        X.append(m + 1)
        Y.append(ranked[m][1])

    plt.bar(X, Y, color='#fffe71')
    plt.title("Топ 30 Ñлов по нелемматизированным текÑтам")
    plt.xlabel("Слова")
    plt.ylabel("ЧаÑтотноÑть Ñлов")
    plt.xticks(X, words_top, rotation=60)
    plt.subplots_adjust(bottom=0.3)
    plt.xlim(0, 31)
    plt.ylim(0, 150)
    plt.show()
622
623
def graph_7_and_cloud_top_30_words_lemmatized():
    """Bar chart + word cloud of the top 30 lemmatized words.

    Reads data_lemmatized.txt, filters stop words, shows the bar chart,
    then renders a WordCloud and saves it to words_cloud.png.
    """
    with open("data_lemmatized.txt", "r", encoding='utf-8') as fh:
        texts_local = fh.read()
    # Set lookup is O(1) per word vs. the original list scan.
    stop_words = set(get_stop_words_from_file('stop_ru.txt'))

    words_count = dict()
    for word in re.findall(r'\w+', texts_local.lower()):
        if word not in stop_words:
            words_count[word] = words_count.get(word, 0) + 1

    ranked = sorted(words_count.items(), key=operator.itemgetter(1),
                    reverse=True)
    words_top = []
    X = []
    Y = []

    for m in range(min(30, len(ranked))):
        words_top.append(ranked[m][0])
        X.append(m + 1)
        Y.append(ranked[m][1])

    plt.bar(X, Y, color='#89a203')
    plt.title("Топ 30 Ñлов по лемматизированным текÑтам")
    plt.xlabel("Слова")
    plt.ylabel("ЧаÑтотноÑть Ñлов")
    plt.xticks(X, words_top, rotation=60)
    plt.subplots_adjust(bottom=0.3)
    plt.xlim(0, 31)
    plt.ylim(0, 300)
    plt.show()

    # The cloud takes the full frequency dict and caps itself at 30 words.
    cloud = WordCloud(background_color="white", max_words=30)
    cloud.generate_from_frequencies(words_count)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

    cloud.to_file("words_cloud.png")
668
669
670# ##############################################################
671
672
def run_on_existing_data():
    """Draw all seven graphs from data already stored in the database."""
    c1.execute('SELECT * FROM posts')
    posts_all = c1.fetchall()

    # Graphs 1-5 all take the post rows; 6 and 7 read the text dumps.
    for draw in (graph_1_posts_and_comments_lengths,
                 graph_2_post_time_and_length,
                 graph_3_post_weekday_and_length,
                 graph_4_sex_and_comments_length,
                 graph_5_number_of_friends_and_comments_length):
        draw(posts_all)
    graph_6_top_30_words_unlemmatized()
    graph_7_and_cloud_top_30_words_lemmatized()
684
685
def rewrite_all_data_and_run():
    """Drop and rebuild all tables, re-download everything, draw all graphs.

    WARNING: destroys the local database and re-fetches posts, comments
    and user profiles from the VK API (slow; needs a valid token).
    """
    c1.execute('DROP TABLE IF EXISTS posts')
    c1.execute('DROP TABLE IF EXISTS comments')
    c1.execute('DROP TABLE IF EXISTS users')

    c1.execute("CREATE TABLE IF NOT EXISTS posts(post_id integer, owner_id integer"
               ", weekday integer, comments integer, text text, date integer)")
    c1.execute("CREATE TABLE IF NOT EXISTS comments(id integer, user_id integer"
               ", post_id integer, text text)")
    c1.execute("CREATE TABLE IF NOT EXISTS users(user_id integer, is_private integer"
               ", name text, sex integer, number_of_friends integer)")

    posts = get_posts(151)                       # WARNING!!!
    save_posts_data_to_database(posts)           # WARNING!!!
    save_comments_data_to_database(posts)        # WARNING!!!
    save_lemmatized_data(get_data_from_file())   # WARNING!!!

    # The original duplicated the fetch-posts-and-draw-graphs tail of
    # run_on_existing_data(); reuse it instead of copy-paste.
    run_on_existing_data()
716
717
# Script entry point: draw the graphs from existing database contents.
# NOTE(review): this also runs on import — an `if __name__ == '__main__':`
# guard would be the conventional fix, but it would change import behaviour.
run_on_existing_data()

conn.close()