import csv
import os
import random
import sys

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Hard-coded pool of public proxies (ip:port). Proxies are removed from the front
# of the list as requests through them fail.
list_proxies = [
    "103.250.68.10:8080",
    "178.33.251.230:3129",
    "88.99.149.188:31288",
    "51.75.147.40:3128",
    "125.27.251.124:45861",
    "51.254.237.77:3129",
    "104.248.63.49:31583",
    "125.26.99.185:36525",
    "183.164.227.165:4216",
    "180.109.124.30:4216",
    "103.31.251.18:8080",
    "110.44.133.135:3128",
    "175.100.5.52:32721",
    "182.72.150.242:8080",
    "51.75.147.44:3128",
    "78.96.125.24:3128",
    "176.56.107.214:52184",
    "125.26.99.186:41358",
    "217.172.170.116:3838",
    "62.210.177.105:3128",
    "46.225.241.66:3128",
    "180.211.183.178:60604",
    "116.212.129.58:59557",
    "189.195.162.242:8080",
    "165.22.64.68:33874",
    "186.226.172.165:57783",
    "43.248.24.157:51166",
    "78.96.125.24:3128",
    "1.20.102.102:38816",
    "118.174.220.11:60148",
    "195.154.232.38:3838",
    "5.202.188.154:3128",
    "46.151.108.6:41171",
    "119.82.252.29:46872",
    "117.6.161.118:53281",
    "104.248.63.49:31583",
    "1.20.103.196:42792",
    "182.72.150.242:8080",
    "108.163.66.164:8080",
    "103.117.195.224:8686"
]

# Default browser-like headers sent with every proxied request.
_headers = {
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Referer': 'http://www.wikipedia.org/',
    'Connection': 'keep-alive',
}

def get_proxy_function():
    # Return the first proxy in the pool in the format requests expects.
    return {"https": list_proxies[0]}


def delete_proxy():
    # Drop the first proxy from the pool once it stops working.
    if len(list_proxies) > 0:
        list_proxies.pop(0)


def Proxy_Request(request_type='get', url='', **kwargs):
    """
    Send a request through the current proxy, rotating to the next proxy on failure.

    :param request_type: HTTP method ('get', 'post', 'put', ...)
    :param url: URL you want to scrape
    :param kwargs: any other parameters passed through to requests.request
    :return: the Response object, or None if every proxy in the pool failed
    """
    while len(list_proxies) > 0:
        proxy = get_proxy_function()
        print("Using Proxy {}".format(proxy))
        try:
            return requests.request(request_type, url, proxies=proxy, headers=_headers, timeout=8, **kwargs)
        except requests.exceptions.RequestException:
            # The current proxy is dead or too slow; discard it and try the next one.
            delete_proxy()
    print("All proxies in the pool have been exhausted.")
    return None


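# Minimal usage sketch for the proxy helper (commented out so it does not run on
# import): fetch a page through the rotating proxies and parse it with BeautifulSoup.
# The URL here is only an example.
#
#   response = Proxy_Request(request_type='get', url='https://www.gsmarena.com/makers.php3')
#   if response is not None:
#       print(BeautifulSoup(response.content, 'html.parser').title.get_text())
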
# class Random_Proxy(object):
#
#     def __init__(self):
#         self.__url = 'https://www.sslproxies.org/'
#         self.__headers = {
#             'Accept-Encoding': 'gzip, deflate, sdch',
#             'Accept-Language': 'en-US,en;q=0.8',
#             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
#             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
#             'Referer': 'http://www.wikipedia.org/',
#             'Connection': 'keep-alive',
#         }
#         self.random_ip = []
#         self.random_port = []
#
#     def __random_proxy(self):
#
#         """
#         This is a private function; clients should not have access to it.
#         :return: Dictionary object with a random proxy ip and port number
#         """
#
#         r = requests.get(url=self.__url, headers=self.__headers)
#         soup = BeautifulSoup(r.text, 'html.parser')
#
#         # Get the random IP addresses
#         for x in soup.findAll('td')[::8]:
#             self.random_ip.append(x.get_text())
#
#         # Get their ports
#         for y in soup.findAll('td')[1::8]:
#             self.random_port.append(y.get_text())
#
#         # Zip them together
#         z = list(zip(self.random_ip, self.random_port))
#
#         # Pick a random IP address and its corresponding port number
#         number = random.randint(0, len(z) - 50)
#         ip_random = z[number]
#
#         # Convert the tuple into a string and format the IP and port
#         ip_random_string = "{}:{}".format(ip_random[0], ip_random[1])
#
#         # Create a proxy
#         proxy = {'https': ip_random_string}
#
#         # Return the proxy
#         return proxy
#
#     def Proxy_Request(self, request_type='get', url='', **kwargs):
#         """
#
#         :param request_type: GET, POST, PUT
#         :param url: URL you want to scrape
#         :param kwargs: any other parameters you pass
#         :return: the Response object
#         """
#         while True:
#             try:
#                 proxy = get_proxy_function()
#                 print("Using Proxy {}".format(proxy))
#                 r = requests.request(request_type, url, proxies=proxy, headers=self.__headers, timeout=8, **kwargs)
#                 return r
#             except:
#                 pass


# The Gsmarena class scrapes the website's phone brands and device pages and saves
# each brand's devices to its own csv file.
class Gsmarena():

    # Constructor initialising common variables used throughout the program.
    def __init__(self):
        self.phones = []
        self.features = ["Brand", "Model Name", "Model Image"]
        self.temp1 = []
        self.phones_brands = []
        self.url = 'https://www.gsmarena.com/'  # GSMArena website url.
        self.new_folder_name = 'GSMArenaDataset'  # Folder in which the csv files are saved.
        self.absolute_path = os.path.join(os.getcwd(), self.new_folder_name)  # Absolute path of the GSMArenaDataset folder.

    # This function fetches and parses the html of the requested URL.
    def crawl_html_page(self, sub_url):

        url = self.url + sub_url  # Full url of the page to parse.

        # Handle connection errors while fetching the url.
        try:
            r = Proxy_Request(request_type="get", url=url)
            if r is None:
                print("All proxies failed. Please re-run the script with fresh proxies.")
                sys.exit()

            soup = BeautifulSoup(r.content, 'html.parser')  # Parse the html returned for the requested url.
            title = soup.find('title')
            if title is not None and title.get_text(strip=True) == "Too Many Requests":
                # The current proxy has been rate-limited; drop it and retry the page.
                delete_proxy()
                return self.crawl_html_page(sub_url)
            else:
                return soup

        except requests.exceptions.ConnectionError:
            print("Please check your network connection and re-run the script.")
            sys.exit()

        except Exception:
            print("Please check your network connection and re-run the script.")
            sys.exit()

    # This function crawls the mobile phone brands and returns them as a list.
    def crawl_phone_brands(self):
        phones_brands = []
        soup = self.crawl_html_page('makers.php3')
        table = soup.find_all('table')[0]
        table_a = table.find_all('a')
        for a in table_a:
            temp = [a['href'].split('-')[0], a.find('span').text.split(' ')[0], a['href']]
            phones_brands.append(temp)
        return phones_brands

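    # For illustration only: each entry returned by crawl_phone_brands() is expected to
    # look roughly like ['samsung', 'Samsung', 'samsung-phones-9.php'], i.e. a brand
    # slug, the brand name, and the relative link to the brand's listing page (the
    # exact href format is an assumption about GSMArena's URL scheme).
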
    # This function crawls every listing page of a brand and returns the links of all its device models.
    def crawl_phones_models(self, phone_brand_link):
        links = []
        nav_link = []
        soup = self.crawl_html_page(phone_brand_link)
        nav_data = soup.find(class_='nav-pages')
        if not nav_data:
            nav_link.append(phone_brand_link)
        else:
            nav_link = nav_data.findAll('a')
            nav_link = [link['href'] for link in nav_link]
            # Put the brand's first page at the front of the pagination list.
            nav_link.append(phone_brand_link)
            nav_link.insert(0, nav_link.pop())
        for link in nav_link:
            soup = self.crawl_html_page(link)
            data = soup.find(class_='section-body')
            for line1 in data.findAll('a'):
                links.append(line1['href'])

        return links

    # This function crawls a single device page and returns its specifications as a dictionary.
    def crawl_phones_models_specification(self, link, phone_brand):
        phone_data = {}
        try:
            soup = self.crawl_html_page(link)
            model_name = soup.find(class_='specs-phone-name-title').text
            model_img_html = soup.find(class_='specs-photo-main')
            model_img = model_img_html.find('img')['src']
            phone_data.update({"Brand": phone_brand})
            phone_data.update({"Model Name": model_name})
            phone_data.update({"Model Image": model_img})
            for table in soup.findAll('table'):
                for line in table.findAll('tr'):
                    temp = []
                    for l in line.findAll('td'):
                        text = l.getText().strip().replace("\n", "")
                        temp.append(text)
                    # Skip rows that do not contain a label/value pair.
                    if len(temp) < 2:
                        continue
                    if temp[0] in phone_data.keys():
                        temp[0] = temp[0] + '_1'
                    if temp[0] not in self.features:
                        self.features.append(temp[0])
                    phone_data.update({temp[0]: temp[1]})
        except Exception:
            print("Exception while parsing", link)
        return phone_data

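    # For illustration only: a phone_data dictionary produced above is expected to look
    # roughly like
    #   {'Brand': 'samsung', 'Model Name': 'Samsung Galaxy S20',
    #    'Model Image': 'https://.../galaxy-s20.jpg', 'Technology': 'GSM / HSPA / LTE', ...}
    # where every specification label found in the page's tables becomes a key. The
    # concrete model and values here are made-up examples, not scraped data.
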
    # This function creates the 'GSMArenaDataset' folder if it does not exist yet.
    def create_folder(self):
        if not os.path.exists(self.new_folder_name):
            print("Creating", self.new_folder_name, "folder....")
            os.makedirs(self.new_folder_name)
            print("Folder created.")
        else:
            print(self.new_folder_name, "directory already exists.")

    # This function returns the list of csv files already present in the 'GSMArenaDataset' directory.
    def check_file_exists(self):
        return os.listdir(self.absolute_path)

    # This function saves the device specifications of each brand to its own csv file.
    def save_specification_to_file(self):
        phone_brands = self.crawl_phone_brands()
        self.create_folder()
        files_list = self.check_file_exists()
        for brand in phone_brands:
            phones_data = []
            if (brand[0].title() + '.csv') not in files_list:
                links = self.crawl_phones_models(brand[2])
                model_value = 1
                print("Working on the", brand[0].title(), "brand.")
                for link in links:
                    datum = self.crawl_phones_models_specification(link, brand[0])
                    datum = {k: v.replace('\n', ' ').replace('\r', ' ') for k, v in datum.items()}
                    phones_data.append(datum)
                    print("Completed", model_value, "/", len(links))
                    model_value += 1
                with open(self.absolute_path + '/' + brand[0].title() + ".csv", "w", newline='', encoding='utf8') as file:
                    dict_writer = csv.DictWriter(file, fieldnames=self.features)
                    dict_writer.writeheader()
                    for row in phones_data:
                        dict_writer.writerow(row)
                print("Data loaded in the file.")
            else:
                print(brand[0].title() + '.csv file is already in your directory.')
            if brand == phone_brands[-1]:
                print('Crawling finished. The data is stored in the dataset folder.\n')
                sys.exit()

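# Minimal driver sketch (commented out; the interactive main() below is the real entry
# point): crawl every brand and write one csv per brand into GSMArenaDataset/.
#
#   scraper = Gsmarena()
#   scraper.save_specification_to_file()
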


def output_csv():
    try:
        while True:
            obj = Gsmarena()
            obj.save_specification_to_file()
    except KeyboardInterrupt:
        print("Crawling has been stopped due to a keyboard interruption.")


def search_csv():
    # Ask for the csv (brand) name you want to search in.
    brand = input('Enter the brand you want to search\n')
    file_name = './GSMArenaDataset/' + brand + '.csv'

    device_name = brand + ' ' + input('Enter the device name\n')
    print('Searching for a device named ' + device_name)

    matches = 0

    # Read the brand csv and copy every matching row into Search_Result.csv.
    with open(file_name, "r", encoding='utf8') as r_file, \
            open('Search_Result.csv', 'w', newline='', encoding='utf8') as w_file:
        reader = csv.DictReader(r_file, delimiter=",")
        writer = csv.DictWriter(w_file, fieldnames=reader.fieldnames)
        writer.writeheader()
        for row in reader:
            # If the row's 'Model Name' equals the input, write that row out.
            if device_name == row['Model Name']:
                matches += 1
                writer.writerow(row)

    print(str(matches) + ' device(s) have been found')

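# Example session for search_csv() (illustrative only; the brand and device name are
# made up):
#
#   Enter the brand you want to search
#   Samsung
#   Enter the device name
#   Galaxy S20
#   Searching for a device named Samsung Galaxy S20
#   1 device(s) have been found
#
# The matching rows end up in Search_Result.csv.
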
def filter_file(filename):
    # Key-spec columns to keep in the filtered output.
    field_name = {'Model Name', 'Announced', 'Dimensions', 'Weight', 'Build', 'Size', 'Resolution', 'Internal',
                  'Single', 'Chipset', 'GPU', '_1_1'}
    # Only load the key-spec columns that actually exist in this csv.
    df = pd.read_csv(filename, usecols=lambda column: column in field_name)
    df = df.transpose()
    df.to_csv('Filter_Result.csv', encoding='utf-8', header=False)
    print('Filtering finished')

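# Minimal usage sketch for filter_file() (commented out; the path and brand are
# hypothetical): keep only the key-spec columns of one brand's dataset and write the
# transposed table to Filter_Result.csv.
#
#   filter_file('./GSMArenaDataset/Samsung.csv')
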

def filter_csv():
    filter_option = input(
        "Enter 1 or 2 to choose a mode: \n 1. Filter a database file. \n 2. Filter Search_Result.csv. \n")
    if filter_option == '1':
        # Ask for the brand csv you want to filter.
        brand = input('Enter the brand you want to filter\n')
        file_name = './GSMArenaDataset/' + brand + '.csv'
        filter_file(file_name)
    elif filter_option == '2':
        file_name = './Search_Result.csv'
        filter_file(file_name)


# The main function switches between crawling the website, searching the existing csv
# files, and filtering a csv file, based on the option entered by the user.
def main():
    user_option = input(
        "Enter 1, 2 or 3 to choose a mode: \n 1. Output web crawler data to csv files. \n 2. Search for devices in existing csv files. \n 3. Filter key specs and transpose a csv file. \n")
    if user_option == '1':
        output_csv()
    elif user_option == '2':
        search_csv()
    elif user_option == '3':
        filter_csv()


if __name__ == "__main__":
    while True:
        main()