# Snippet originally shared Jul 09, 2020.
1# -*- coding: UTF-8 -*-
2from bs4 import BeautifulSoup as bs
3import requests
4import re
5import time
6import threading
7from random_word import RandomWords
# Request a Russian-language interface so the subscriber-count strings
# parsed later ('тыс', 'млн') are predictable.
headers = {'accept-language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7'}

request = []      # search keywords fed into YouTube search
all_emails = []   # e-mails already seen (deduplication across threads)
# BUG FIX: the original was a single string '@mail.ru @yandex.ru', which can
# never be a substring of an e-mail address, so the blacklist never matched.
blacklist = ['@mail.ru', '@yandex.ru']
WORDS = 200            # number of words to auto-generate
start = 0              # first URL index handled by a worker thread
end = 100              # one-past-last URL index handled by a worker thread
min_subs = 20000       # minimum subscriber count to accept a channel
max_subs = 350000      # maximum subscriber count to accept a channel
class Parser:
    """Collects e-mail addresses from YouTube channels found via search."""

    def __init__(self):
        # BUG FIX: originally misspelled `_init__`, so Python never treated it
        # as the constructor. It is a no-op either way.
        pass

    def get_urls_to_pars(self, request):
        """Return a list of /watch URLs from YouTube search results.

        request: iterable of search keywords.
        Network errors are logged and the keyword is skipped, so one bad
        keyword does not abort the whole crawl.
        """
        session = requests.Session()
        all_urls = []
        for keyword in request:
            try:
                # BUG FIX: the original URL lacked '=' after 'search_query',
                # so YouTube always received an empty query string.
                page = session.get(
                    f'https://www.youtube.com/results?search_query={keyword}&sp=EgQIBBAB',
                    headers=headers)
                soup = bs(page.text, 'html.parser')
                for link in soup.find_all('a', attrs={'dir': 'ltr'}):
                    if '/watch' in link['href']:
                        all_urls.append(link['href'])
            except (requests.exceptions.ProxyError,
                    requests.exceptions.ConnectionError) as err:
                print(f'Ошибка с соединением. Возможно вы посылаете много запросов | {err}')
            except Exception:
                print('Ошибка')
                continue
        return all_urls

    @staticmethod
    def _subs_to_int(label):
        """Parse a Russian subscriber label into an integer count.

        Handles plain numbers ('250000'), thousands ('123 тыс.') and millions
        ('1,2 млн'); returns 0 when the label cannot be parsed.
        """
        cleaned = label.replace('\xa0', '').replace(' ', '')
        multiplier = 1
        for suffix, factor in (('тыс', 1000), ('млн', 1000000)):
            pos = cleaned.find(suffix)
            if pos != -1:
                cleaned = cleaned[:pos]
                multiplier = factor
                break
        try:
            # BUG FIX: the original dropped the fraction ('1,2 млн' -> 1 млн);
            # treat the comma as a decimal point instead.
            return round(float(cleaned.replace(',', '.')) * multiplier)
        except ValueError:
            return 0

    def pars_mails(self, urls, start, end):
        """Scan urls[start:end] for channel e-mails and persist matches.

        Appends accepted addresses to mails.txt, mails_full.txt and
        mails_db.txt; dedupes via the module-level `all_emails` list and
        filters by the `min_subs`/`max_subs` range and `blacklist`.
        """
        session = requests.Session()
        email_re = re.compile(r"[\w\.-]+@[\w\.-]+")  # hoisted: compiled once
        for i in range(start, end):
            try:
                page = session.get(f'https://www.youtube.com{urls[i]}',
                                   headers=headers)
                mail = email_re.search(page.text.replace("\\n", ""))
                if not mail or mail[0] in all_emails:
                    continue
                soup = bs(page.text, 'html.parser')
                subs_span = soup.find('span', attrs={'class': 'yt-subscription-button-subscriber-count-branded-horizontal yt-subscriber-count'})
                subi = self._subs_to_int(subs_span["aria-label"])
                channel_url = ('https://www.youtube.com/channel/'
                               + soup.find('meta', attrs={'itemprop': 'channelId'}).get('content'))
                if not (min_subs <= subi <= max_subs and '.' in mail[0]):
                    continue
                all_emails.append(mail[0])
                if any(domain in mail[0] for domain in blacklist):
                    continue  # remembered for dedupe, but never written out
                print(f'subs - {subi}, email - {mail[0]}, url - {channel_url}')
                # BUG FIX: `with` closes the files even when a write fails;
                # the original leaked all three handles on any exception.
                with open('mails.txt', 'a') as results:
                    results.write(f'{mail[0]}\n')
                with open('mails_full.txt', 'a') as results2:
                    results2.write('=' * 15 + '\n')
                    results2.write(f'subs - {subi}, email - {mail[0]}, url = {channel_url}\n')
                    results2.write('=' * 15 + '\n')
                    results2.write('\n')
                with open('mails_db.txt', 'a') as results3:
                    results3.write(f'{mail[0]}\n')
            except Exception as err:
                # Missing subscriber span / channelId meta, bad index, or a
                # network failure: log, back off briefly, move on.
                print(err)
                time.sleep(0.8)
                continue

    def get_words(self):
        """Fetch 25 random English words from jimpix.co.uk as search keywords."""
        page = requests.get('https://jimpix.co.uk/generators/word-generator.asp?go=yes&ul1=0&chars=0&match=0&numwords=25&aplha=0&lk=&lki=lki1')
        soup = bs(page.content, 'html.parser')
        anchors = soup.find('ul', attrs={'class': 'list-unstyled'}).find_all_next('a', attrs={'style': 'color:white;'})
        return [a.text for a in anchors]
111
112
if __name__ == '__main__':
    parse = Parser()
    print('Выберите режим работы: 1 - Автоматическая генерация слов и поиск почт, 2 - Загрузка слов из requests.txt [НЕ ЗАБУДЬТЕ СОЗДАТЬ файл requests.txt и записать туда слова для поиска, 3 - Бесконечный парсинг почт с ютуба')
    mode = input('')

    def run_batch(urls):
        """Split `urls` into chunks of 100 and parse each chunk in a thread.

        Replaces the original copy-pasted spawning code that mutated the
        module-level `start`/`end` counters (a latent bug when runs overlap).
        """
        print('Начинаю искать почты...')
        lo = 0
        for _ in range(len(urls) // 100):
            threading.Thread(target=parse.pars_mails, args=(urls, lo, lo + 100)).start()
            lo += 100
        # Last thread picks up the remaining len(urls) % 100 links.
        threading.Thread(target=parse.pars_mails, args=(urls, lo, len(urls))).start()

    if mode == "1":
        # Keep retrying until the word generator responds.
        while True:
            try:
                print('Генерирую ключевые слова для поиска')
                request = parse.get_words()
                print(f'Удалось сгенерировать {len(request)} слов')
                break
            except Exception:
                continue
    elif mode == "2":
        print('Загружаю ключевые слова из requests.txt...')
        try:
            # BUG FIX: the original opened requests.txt twice (once just to
            # count lines) and never closed either handle.
            with open('requests.txt', encoding='utf-8') as f:
                for line in f:
                    request.append(line.replace('\n', ''))
        except Exception:
            print("Ошибка, возможно вы не создали файл requests.txt")
    elif mode == "3":
        print('Выбран режим бесконечного парсинга почт с ютуба...')
        while True:  # endless generate -> collect -> parse cycle; never exits
            try:
                request = parse.get_words()
                print(request)
                print(f'Удалось сгенерировать {len(request)} слов')
                print('Собираю ссылки для парсинга...')
                urls = parse.get_urls_to_pars(request)
                print(f' собрал {len(urls)} ссылок')
                run_batch(urls)
                time.sleep(10)
            except Exception as error:
                print(error)
                time.sleep(3)
                continue

    # Modes 1 and 2 fall through to a single parsing pass (mode 3 never
    # reaches this point).
    print(request)
    print('Собираю ссылки для парсинга...')
    urls = parse.get_urls_to_pars(request)
    print(f' собрал {len(urls)} ссылок')
    run_batch(urls)