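"""Bulk-check a CSV of Amazon product URLs: fetch each page with
selenium-requests (optionally through the ScraperAPI proxy), extract the
product title, and append URL / status / verified / title rows to an
output CSV. Already-processed links are skipped on re-runs."""
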
import csv
import os
import time
import urllib.parse
from concurrent.futures import ProcessPoolExecutor

from seleniumrequests import Chrome
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup  # parses with 'lxml', which must be installed

options = Options()
options.headless = False
options.add_argument("--log-level=3")
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")

PROXY = True
THREADS = 2  # batch size: how many links are checked per executor run

# Your ScraperAPI key here
API_KEY = '33959fa3837c3b7504d93b3bfdcb242a'

# Request headers; note the script never passes these to the Chrome session
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.0 Safari/537.36',
    'Cookie': 'session-id=262-8803972-3996461; session-id-time=2082787201l; i18n-prefs=INR; ubid-acbin=261-0743468-3807160; session-token=fXut7Q+tWl2a9Kh6+v0TDRH4FoBgAog3qGuLVvMiQwfXKMV1QDHxMmsU0zLmnPJHvWlqb86L11csJpC5xnALggHDAsoNNJSdyJ3889Hi2ahmOLljtqU0MyLfzTjmPMJEqdsLuo5yNVlSavwI9l2UfrLF8AChbdTphnXmXMTJT73YljLsvBiRy2EU/9fyL0JQ'
}

def check_url(url):
    """Fetch one product page, retrying up to 5 times.
    Returns a tuple: (url, status_code, title, verified)."""
    url = url['link']
    for _ in range(5):
        driver = Chrome(options=options)
        try:
            if PROXY:
                # Rebuild the proxied URL from the original link on every attempt,
                # so a retry never wraps an already-proxied URL a second time.
                request_url = ('http://api.scraperapi.com?api_key=' + API_KEY
                               + '&url=' + urllib.parse.quote(url, safe=''))
            else:
                request_url = url
            response = driver.request('GET', request_url)
            soup = BeautifulSoup(response.text, 'lxml')
            if response.status_code == 200:
                title_tag = soup.find('span', {'id': 'productTitle'})
                if title_tag is not None:
                    title = title_tag.text.replace('\n', '')
                    driver.quit()
                    return url, response.status_code, title, True
                if soup.find('title').text == 'Robot Check':
                    print('Robot check found, sleeping')
                    time.sleep(5)
                driver.quit()
                continue
            driver.quit()
            return url, response.status_code, '', True
        except Exception:
            driver.quit()
            print('Retrying')
    return url, 'failed', '', False
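

# A direct call (hypothetical example URL and output) looks like this,
# outside the batch flow below:
#   check_url({'link': 'https://www.amazon.in/dp/B000000000', 'index': 0})
#   -> ('https://www.amazon.in/dp/B000000000', 200, 'Some Product Title', True)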


def read_csv(path):
    """Return the first column of a CSV file, skipping the header row."""
    try:
        with open(path, 'r') as file:
            links = []
            reader = list(csv.reader(file))
            for row in reader[1:]:
                links.append(row[0])
            print(len(links), 'links found')
            return links
    except FileNotFoundError:
        print('No such file or directory:', path)
        return []


def write_csv(data):
    # Append a single row to the output file (uses the module-level 'outfile')
    with open(outfile, 'a', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(data)


if __name__ == '__main__':
    # File operations

    # Input file path
    input_file = 'data.csv'

    filename = 'output-thread-' + os.path.basename(input_file)

    # Output directory (created up front so the first write doesn't fail)
    dir_name = 'out'
    os.makedirs(dir_name, exist_ok=True)
    outfile = os.path.join(dir_name, filename)

    links = read_csv(input_file)

    # Links already processed in a previous run (first column of the output file)
    pro_links = read_csv(outfile)

    # Writing the header fields on a fresh output file
    if len(pro_links) == 0:
        write_csv(['URL', 'Status Code', 'Verified', 'Product Title'])

    def run_batch(batch):
        # Check a batch of links in parallel and append each result to the CSV
        with ProcessPoolExecutor() as e:
            results = e.map(check_url, batch)
        for thread_link, data in zip(batch, results):
            print(data)
            print(thread_link['index'] + 1, ' ', thread_link['link'], ' ',
                  data[1], ' ', data[3], ' ', data[2])
            write_csv([thread_link['link'], data[1], data[3], data[2]])

    thread_links = []

    for index, link in enumerate(links):
        if link in pro_links:
            continue

        thread_links.append({
            'link': link,
            'index': index
        })

        # Run a batch once THREADS links have accumulated
        if len(thread_links) == THREADS:
            run_batch(thread_links)
            thread_links = []

    # Flush any leftover links that didn't fill a final batch
    if thread_links:
        run_batch(thread_links)
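
# Expected input: 'data.csv' with a header row and one product link per row in
# the first column, e.g. (header name and links are illustrative):
#   url
#   https://www.amazon.in/dp/B000000000
#   https://www.amazon.in/dp/B000000001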