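"""Bulk-check a CSV of Amazon product URLs: fetch each page with
selenium-requests (optionally through the ScraperAPI proxy), extract the
product title, and append URL / status / verified / title rows to an
output CSV. Already-processed links are skipped on re-runs."""
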
import csv
import os
import time
import urllib.parse
from concurrent.futures import ProcessPoolExecutor

from seleniumrequests import Chrome
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup  # parses with 'lxml', which must be installed

options = Options()
options.headless = False
options.add_argument("--log-level=3")
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")

PROXY = True
THREADS = 2  # batch size: how many links are checked per executor run

# Your ScraperAPI key here
API_KEY = '33959fa3837c3b7504d93b3bfdcb242a'

# Request headers; note the script never passes these to the Chrome session
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.0 Safari/537.36',
    'Cookie': 'session-id=262-8803972-3996461; session-id-time=2082787201l; i18n-prefs=INR; ubid-acbin=261-0743468-3807160; session-token=fXut7Q+tWl2a9Kh6+v0TDRH4FoBgAog3qGuLVvMiQwfXKMV1QDHxMmsU0zLmnPJHvWlqb86L11csJpC5xnALggHDAsoNNJSdyJ3889Hi2ahmOLljtqU0MyLfzTjmPMJEqdsLuo5yNVlSavwI9l2UfrLF8AChbdTphnXmXMTJT73YljLsvBiRy2EU/9fyL0JQ'
}

def check_url(url):
    """Fetch one product page, retrying up to 5 times.
    Returns a tuple: (url, status_code, title, verified)."""
    url = url['link']
    for _ in range(5):
        driver = Chrome(options=options)
        try:
            if PROXY:
                # Rebuild the proxied URL from the original link on every attempt,
                # so a retry never wraps an already-proxied URL a second time.
                request_url = ('http://api.scraperapi.com?api_key=' + API_KEY
                               + '&url=' + urllib.parse.quote(url, safe=''))
            else:
                request_url = url
            response = driver.request('GET', request_url)
            soup = BeautifulSoup(response.text, 'lxml')
            if response.status_code == 200:
                title_tag = soup.find('span', {'id': 'productTitle'})
                if title_tag is not None:
                    title = title_tag.text.replace('\n', '')
                    driver.quit()
                    return url, response.status_code, title, True
                if soup.find('title').text == 'Robot Check':
                    print('Robot check found, sleeping')
                    time.sleep(5)
                driver.quit()
                continue
            driver.quit()
            return url, response.status_code, '', True
        except Exception:
            driver.quit()
            print('Retrying')
    return url, 'failed', '', False
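

# A direct call (hypothetical example URL and output) looks like this,
# outside the batch flow below:
#   check_url({'link': 'https://www.amazon.in/dp/B000000000', 'index': 0})
#   -> ('https://www.amazon.in/dp/B000000000', 200, 'Some Product Title', True)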


def read_csv(path):
    """Return the first column of a CSV file, skipping the header row."""
    try:
        with open(path, 'r') as file:
            links = []
            reader = list(csv.reader(file))
            for row in reader[1:]:
                links.append(row[0])
            print(len(links), 'links found')
            return links
    except FileNotFoundError:
        print('No such file or directory:', path)
        return []


def write_csv(data):
    # Append a single row to the output file (uses the module-level 'outfile')
    with open(outfile, 'a', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(data)


if __name__ == '__main__':
    # File operations

    # Input file path
    input_file = 'data.csv'

    filename = 'output-thread-' + os.path.basename(input_file)

    # Output directory (created up front so the first write doesn't fail)
    dir_name = 'out'
    os.makedirs(dir_name, exist_ok=True)
    outfile = os.path.join(dir_name, filename)

    links = read_csv(input_file)

    # Links already processed in a previous run (first column of the output file)
    pro_links = read_csv(outfile)

    # Writing the header fields on a fresh output file
    if len(pro_links) == 0:
        write_csv(['URL', 'Status Code', 'Verified', 'Product Title'])

    def run_batch(batch):
        # Check a batch of links in parallel and append each result to the CSV
        with ProcessPoolExecutor() as e:
            results = e.map(check_url, batch)
        for thread_link, data in zip(batch, results):
            print(data)
            print(thread_link['index'] + 1, ' ', thread_link['link'], ' ',
                  data[1], ' ', data[3], ' ', data[2])
            write_csv([thread_link['link'], data[1], data[3], data[2]])

    thread_links = []

    for index, link in enumerate(links):
        if link in pro_links:
            continue

        thread_links.append({
            'link': link,
            'index': index
        })

        # Run a batch once THREADS links have accumulated
        if len(thread_links) == THREADS:
            run_batch(thread_links)
            thread_links = []

    # Flush any leftover links that didn't fill a final batch
    if thread_links:
        run_batch(thread_links)
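
# Expected input: 'data.csv' with a header row and one product link per row in
# the first column, e.g. (header name and links are illustrative):
#   url
#   https://www.amazon.in/dp/B000000000
#   https://www.amazon.in/dp/B000000001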