# -*- coding: utf-8 -*-
import os
import virustotal
import requests
import urllib
import hashlib
import json
from time import sleep, time
from socket import gethostbyname
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from clint.textui.colored import green, red
import traceback
import check_virustotal_api
import threading

# Sites that are skipped while crawling (search engines, social networks, etc.)
white_list = [
    'www.wikipedia.org', 'vk.com', 'www.twitter.com', 'www.facebook.com', 'www.instagram.com', 'www.youtube.com', 'www.yandex.ru', 'www.linkedin.com',
    'my.mail.ru', 'mail.ru', 'www.odnoklassniki.ru', 'plus.google.com', 'google.com', 'www.livejournal.com', 'www.blogger.com', 'moikrug.ru',
    'blog.friendfeed.com', 'www.pinterest.com', 'www.tumblr.com', 'xakep.ru', 'forum.xakep.ru', 'statcounter.com', 'yandex.ru', 'en.wikipedia.org',
    'ru.wikipedia.org', 'www.solvusoft.com', 'microsoft.com', 'twitter.com', 'telegram.me', 'en.wikipedia.ru', 'r.mail.ru', 'ru.wikihow.com',
]


# Which file formats to look for
FILE_FORMATS = ['.exe', '.msi', '.rtf', '.doc', '.docx', '.xls', '.ppt', '.pptx', '.pdf', '.swf', '.jar', '.apk', '.zip']

pathToDir = "\\works"

# How many result pages to parse per query
PAGES = 1

# Depth of the recursive search over links
DEPTH = 0

API_KEY = "376460aed682390439b0c2c2f9b7922327a60652bdb51b1cc75d03afc76a6934"

counter = {"globalVirusCounter": 0, "globalSites": 0, "globalRecursiveSites": 0, "globalDownloaded": 0,
           '.exe': 0, '.msi': 0, '.rtf': 0, '.doc': 0, '.docx': 0, '.xls': 0, '.ppt': 0,
           '.pptx': 0, '.pdf': 0, '.swf': 0, '.jar': 0, '.apk': 0, '.zip': 0}

# Collect links on the page that point to files with one of the FILE_FORMATS extensions
def findLinksToDownloadFile(link, browser):
    linksToFile = []

    html = getHtml(link, browser)
    if html == -1:
        return linksToFile
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('a', href=True)

    for item in items:
        linkToFile = item.get('href')

        if linkToFile.find('.html') == -1:
            for fileFormat in FILE_FORMATS:
                if linkToFile.find(fileFormat) != -1:
                    # Relative link: prepend the scheme and host of the page it was found on
                    if linkToFile.find("http") == -1:
                        startIndex = link.find("://") + 3
                        endIndex = link.find("/", startIndex, len(link))
                        root = link[0:endIndex]
                        linksToFile.append(root + linkToFile)
                    else:
                        linksToFile.append(linkToFile)

    return linksToFile


# Compute the file hash, get the report from VirusTotal and log it
# (returns 1 if the file is malware, 0 otherwise)
def makeReport(virusTotal, pathToFile):
    fileHash = calculateFileHash(pathToFile)
    report = checkFileByHash(virusTotal, fileHash)
    #with open(pathToFile + '_VIRUS_TOTAL_REPORT.txt', 'w') as file:
    #    file.write(report)

    # A report of 1 means the file was flagged as malicious
    if report == 1:
        print(red("[+] MALWARE!"))
        return 1

    print(green("[-] No"))
    return 0
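

# Hedged sketch (not part of the original script): makeReport() above calls
# calculateFileHash() and checkFileByHash(), which are not defined in this file
# and are not reachable through the bare "import check_virustotal_api".
# The assumed implementations below use hashlib and the public VirusTotal v2
# file-report endpoint; the original helpers may differ.
def calculateFileHash(pathToFile):
    # SHA-256 of the file contents, read in chunks so large downloads fit in memory
    sha256 = hashlib.sha256()
    with open(pathToFile, 'rb') as f:
        for chunk in iter(lambda: f.read(8192), b''):
            sha256.update(chunk)
    return sha256.hexdigest()


def checkFileByHash(virusTotal, fileHash):
    # Ask VirusTotal for a report on the hash: 1 if any engine flagged it, else 0.
    # The virusTotal argument is kept only to match makeReport()'s call signature.
    response = requests.get('https://www.virustotal.com/vtapi/v2/file/report',
                            params={'apikey': API_KEY, 'resource': fileHash})
    report = response.json()
    if report.get('response_code') == 1 and report.get('positives', 0) > 0:
        return 1
    return 0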


# Load the page in the phantomjs browser (headless) and return its html (-1 on failure)
def getHtml(link, browser):
    try:
        browser.get(link)
    except:
        return -1
    html = browser.page_source
    return html


# Get result links for a search query from the first `pages` result pages
def getLinksFromRequest(request, pages, browser):
    linksList = []
    for i in range(pages):
        html = getHtml(request + '&p={0}'.format(i), browser)
        if html == -1:
            return -1
        soup = BeautifulSoup(html, 'html.parser')
        items = soup.findAll('li', class_='serp-item')  # for mail
        if not items:
            print("Error requests for {0}".format(request))
        for item in items:
            try:
                link = item.div.h2.a.get('href')
            except:
                continue
            # Skip links back into yandex.ru itself
            if str(link).find("yandex.ru") == -1:
                linksList.append(link)
    return linksList


def readRequestsFromFile(fileName):
    linksList = []
    requestsList = []
    # Read the query lines
    with open(fileName, 'r') as file:
        for line in file:
            line = line.replace('\n', '')
            requestsList.append(line)
    # Turn each query into a Yandex search URL
    for request in requestsList:
        linksList.append(('https://yandex.ru/search/?lr=2&text=' + request).replace(' ', "%20").replace('\n', ''))
    return linksList, requestsList
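
# Hedged example (not a file shipped with the script) of the requests.txt layout
# that readRequestsFromFile() expects: one plain-text search query per line, e.g.
#
#   free pdf converter download
#   movie player setup exe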


# Download the file at the link and return the full path to it
# (returns an empty string on failure)
def downloadFile(linkToFile, dir):
    # Remove the last symbol if it is '/'
    if linkToFile[len(linkToFile) - 1] == '/':
        linkToFile = linkToFile[:len(linkToFile) - 1]

    # Build the file name from the last part of the URL and download it
    startIndex = linkToFile.rfind("/") + 1
    endIndex = len(linkToFile)
    fileName = linkToFile[startIndex:endIndex]

    try:
        safeName = fileName.encode('ascii', 'ignore').decode('ascii')
        print("\t[*] Loading: {0}: ".format(safeName))
        safeLink = linkToFile.encode('ascii', 'ignore').decode('ascii')
        print("\t[*] Link To File: {0}".format(safeLink))
        data = requests.get(linkToFile)
        with open(dir + '/' + fileName, 'wb') as file:
            file.write(data.content)
        print(green("[+] YEP"))
        # Counter of downloads and counter per file format
        #counter['globalDownloaded'] += 1
        #for fileFormat in FILE_FORMATS:
        #    if linkToFile.find(fileFormat) != -1:
        #        counter[fileFormat] += 1

    except:
        traceback.print_exc()
        print(red("[-] Bad with loading!!!"))
        return ""
    return dir + '/' + fileName


# Create a directory for the site plus an info file with the site name, IP and its links
def createDirForSite(link, linksToAnotherSites):
    # Get the root of the link (used to resolve the IP)
    startIndex = link.find("://") + 3
    endIndex = link.find("/", startIndex, len(link))
    root = link[startIndex:endIndex]

    # Strip forbidden characters and create a folder for the site from the full address
    name = link[startIndex:len(link)].replace('/', "_").replace('?', '-').replace(' ', '')
    pathToDir = "files/" + name
    # Truncate if the name is too long
    if len(pathToDir) > 259:
        pathToDir = pathToDir[:259]
    if not os.path.exists(pathToDir):
        os.makedirs(pathToDir)

    # Resolve the IP and write the URL, the IP and the outgoing links
    try:
        ip = gethostbyname(root)
    except:
        print(root)
        return pathToDir

    with open(pathToDir + '/info.txt', 'w') as infoFile:
        infoFile.write("Url: {0}".format(link + '\n'))
        infoFile.write("IP: {0}".format(ip + '\n'))
        infoFile.write("Link to another sites:\n")
        for otherLink in linksToAnotherSites:
            try:
                infoFile.write(otherLink + '\n')
            except:
                continue
    return pathToDir


# Return the list of links pointing to other sites
def findLinksToAnotherSites(link, browser):
    linksToAnotherSites = []
    html = getHtml(link, browser)
    if html == -1:
        return linksToAnotherSites
    soup = BeautifulSoup(html, 'html.parser')
    # Collect the hrefs, keeping only absolute links to get rid of stray '#' anchors
    for anchor in soup.findAll('a', href=True):
        link_clear = anchor.get('href')
        if link_clear.find("http") != -1:
            linksToAnotherSites.append(link_clear)
    return linksToAnotherSites


# Search for links, download and check the files
def research(link, browser, virusTotal, level, my_paths):

    yourstring = link
    yourstring = yourstring.encode('ascii', 'ignore').decode('ascii')
    #proc = os.getpid()
    #print("[*] Site: {0} ".format(yourstring))

    # Look for links to other sites
    #print("[*] Searching for links on site {0}".format(link))
    linksToAnotherSites = findLinksToAnotherSites(link, browser)

    # Look for links to files
    #print("[*] Searching for files on site {0}".format(link))
    linksToFile = findLinksToDownloadFile(link, browser)

    # If the site has files, print and download them, compute the hash and get the report
    #threadLock.acquire()
    #print("[*] Site: {0}\n[*] Link to site: {1}\n\t[*] Link to file: {2} ".format(yourstring,len(linksToAnotherSites),len(linksToFile)))
    #threadLock.release()
    #print("\t[*] Link to site: {0}\n\t[*] Link to file: {1}".format(len(linksToAnotherSites), len(linksToFile)))
    if len(linksToFile) > 0:
        for linkToFile in linksToFile:
            #threadLock.acquire()
            pathToFile = downloadFile(linkToFile, my_paths)
            #threadLock.release()
            #if pathToFile != "":
            #    counter['globalDownloaded'] += 1

    #print(level)
    #print(linksToAnotherSites)
    if level < DEPTH:
        for localLink in linksToAnotherSites:
            #counter['globalRecursiveSites'] += 1
            #print("\n[!] Level: {0}".format(level + 1))
            every_true = True
            for white in white_list:
                if localLink.find(white) != -1:
                    #print("[***] Miss it: {0}".format(localLink))
                    every_true = False
            if every_true:
                research(localLink, browser, virusTotal, level + 1, my_paths)
    if level == 0:
        startIndex = link.find("://") + 3
        endIndex = link.find("/", startIndex, len(link))
        root = link[startIndex:endIndex]
        try:
            ip = gethostbyname(root)
        except:
            print("[***] Exception with gethostbyname {0}".format(root))
            ip = "0.0.0.0"
        name = link[startIndex:len(link)].replace('/', "_").replace('?', '-').replace(' ', '')
        if len(name) > 80:
            name = name[:79]
        #print("ERerwrwrwre {0}".format(name))
        with open(my_paths + "\\" + name + ".txt", 'w') as infoFile:
            infoFile.write("Url: {0}".format(link + '\n'))
            infoFile.write("IP: {0}".format(ip + '\n'))
            infoFile.write("Link to another sites:{0} \n".format(len(linksToAnotherSites)))
            infoFile.write("Link to file:{0} \n".format(len(linksToFile)))
            for linkToFile in linksToFile:
                #print("{0}\n".format(linkToFile))
                infoFile.write("{0}\n".format(linkToFile))


def main():
    paths = os.getcwd()
    paths = paths + "\\works"
    if not os.path.exists(paths):
        os.makedirs(paths)
    linksList = []
    procs = []
    threadLock = threading.Lock()
    start_time = time()
    # VirusTotal init
    virusTotal = virustotal.VirusTotal(API_KEY)
    print(green('Sobol S.S.'))
    # Open the headless browser
    browser = webdriver.PhantomJS("C:\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe")
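    # Note, not from the original script: PhantomJS support was removed in newer
    # Selenium releases. With Selenium 4+ a roughly equivalent headless setup,
    # offered here only as a hedged alternative, would be:
    #   from selenium.webdriver.chrome.options import Options
    #   options = Options()
    #   options.add_argument('--headless')
    #   browser = webdriver.Chrome(options=options)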
    # Read the queries, fetch the search results for them, then print them
    # (requestsList holds the search URLs, rawQueries the plain queries for printing)
    requestsList, rawQueries = readRequestsFromFile('requests.txt')
    for i in range(len(requestsList)):
        localLinksList = getLinksFromRequest(requestsList[i], PAGES, browser)
        if localLinksList == -1:
            continue
        linksList += localLinksList
        print("Links in request: \"{0}\". Count pages: {1}".format(rawQueries[i], PAGES))
        for j in range(len(localLinksList)):
            print("{0}. {1}".format(j + 1, localLinksList[j]))
        print("")
    #print(len(linksList))
    # For each result site, look for links to files and links to other sites
    for link in linksList:
        #counter['globalSites'] += 1
        every_true = True
        for white in white_list:
            if link.find(white) != -1:
                print("[***] Miss it: {0}".format(link))
                every_true = False
        if every_true:
            proc = threading.Thread(target=research, args=(link, browser, virusTotal, 0, paths))
            #research(link, browser, virusTotal, 0, paths)
            procs.append(proc)
            proc.start()

    for proc in procs:
        proc.join()
    # Final statistics
    difference = (int(time() - start_time))
    with open('statistics.txt', 'w') as file:
        # file.write("Sites: {0} Recursive: {1}\nAll files: {2} Malware: {3}".format(counter['globalSites'],counter['globalRecursiveSites'], counter['globalDownloaded'], counter['globalVirusCounter']))
        # file.write("\nTime of work: {0}\n".format(str(difference // 60) + ':' + str(difference % 60).zfill(2)))
        # file.write('.exe:\t{0}\n.msi:\t{1}\n.rtf:\t{2}\n.doc:\t{3}\n.docx:\t{4}\n.xls:\t{5}\n.ppt:\t{6}\n.pptx:\t{7}\n.pdf:\t{8}\n.swf:\t{9}\n.jar:\t{10}\n.apk: \t{11}'.format(counter['.exe'], counter['.msi'],counter['.rtf'], counter['.doc'], counter['.docx'], counter['.xls'], counter['.ppt'], counter['.pptx'], counter['.pdf'], counter['.swf'], counter['.jar'], counter['.apk']))
        # print("Sites: {0} Recursive: {1}\nAll files: {2} Malware: {3}".format(counter['globalSites'],counter['globalRecursiveSites'], counter['globalDownloaded'], counter['globalVirusCounter']))
        # print("\nTime of work: {0}\n".format(str(difference // 60) + ':' + str(difference % 60).zfill(2)))
        # print('.exe:\t{0}\n.msi:\t{1}\n.rtf:\t{2}\n.doc:\t{3}\n.docx:\t{4}\n.xls:\t{5}\n.ppt:\t{6}\n.pptx:\t{7}\n.pdf:\t{8}\n.swf:\t{9}\n.jar:\t{10}\n.apk: \t{11}'.format(counter['.exe'], counter['.msi'],counter['.rtf'], counter['.doc'], counter['.docx'], counter['.xls'], counter['.ppt'], counter['.pptx'], counter['.pdf'], counter['.swf'], counter['.jar'], counter['.apk']))
        pass  # all statistics writes are commented out; keep the file creation only
    files = os.listdir(paths)
    #print(len(files))
    print("[*] Time of work: {0}\n".format(str(difference // 60) + ':' + str(difference % 60).zfill(2)))
    txtReports = list(filter(lambda x: x.endswith('.txt'), files))
    print("[*] All sites: {0}\n".format(len(txtReports)))
    downloadedFiles = list(filter(lambda x: not x.endswith('.txt'), files))
    print("[*] Download files: {0}\n".format(len(downloadedFiles)))
    browser.close()


if __name__ == '__main__':
    main()