import csv
import os
import random
import sys

import pandas as pd

try:
    import requests
    from bs4 import BeautifulSoup
except ImportError:
    print("Library Not Found! Please install 'requests' and 'beautifulsoup4'.")
    sys.exit(1)
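
# Typical installation of the third-party dependencies imported above (package
# names only; the original pins no versions):
#   pip install requests beautifulsoup4 pandas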


def function_random():
    # Fallback list of public HTTPS proxies hard-coded in 2020; they are
    # almost certainly stale by now and would need replacing before use.
    list_a = ["51.15.166.107:3128",
              "188.68.56.248:3128",
              "177.202.43.110:60196",
              "162.144.35.146:3838",
              "178.252.166.210:8080",
              "83.97.23.90:18080",
              "50.246.120.125:8080",
              ]
    return {"https": random.choice(list_a)}
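

# A minimal sketch of how function_random() could be used with requests,
# assuming one of the hard-coded proxies above is still alive. Not called
# anywhere in this script; https://httpbin.org/ip is only an illustrative
# echo endpoint.
def _demo_function_random():
    proxy = function_random()
    r = requests.get('https://httpbin.org/ip', proxies=proxy, timeout=8)
    print(r.text)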


class Random_Proxy(object):

    def __init__(self):
        self.__url = 'https://www.sslproxies.org/'
        self.__headers = {
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'en-US,en;q=0.8',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Referer': 'http://www.wikipedia.org/',
            'Connection': 'keep-alive',
        }
        self.random_ip = []
        self.random_port = []

    def __random_proxy(self):
        """
        Private helper; clients should not access it directly.
        :return: dictionary with a random proxy and port number
        """
        r = requests.get(url=self.__url, headers=self.__headers)
        soup = BeautifulSoup(r.text, 'html.parser')

        # Get the IP addresses (every 8th cell of the proxy table).
        for x in soup.findAll('td')[::8]:
            self.random_ip.append(x.get_text())

        # Get their ports.
        for y in soup.findAll('td')[1::8]:
            self.random_port.append(y.get_text())

        # Zip together.
        z = list(zip(self.random_ip, self.random_port))

        # Fetch a random IP address and its corresponding port number.
        number = random.randint(0, len(z) - 1)
        ip_random = z[number]

        # Convert the tuple into an "IP:PORT" string.
        ip_random_string = "{}:{}".format(ip_random[0], ip_random[1])

        # Create a proxy dictionary in the format requests expects.
        proxy = {'https': ip_random_string}

        return proxy

    def Proxy_Request(self, request_type='get', url='', **kwargs):
        """
        :param request_type: get, post, put, etc.
        :param url: URL you want to scrape
        :param kwargs: any other parameters to pass through to requests
        :return: the response
        """
        # Keep trying fresh proxies until one request succeeds.
        while True:
            try:
                proxy = self.__random_proxy()
                print("Using Proxy {}".format(proxy))
                r = requests.request(request_type, url, proxies=proxy, headers=self.__headers, timeout=8, **kwargs)
                return r
            except requests.exceptions.RequestException:
                # The proxy was dead or too slow; try another one.
                continue
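

# A hedged usage sketch for Random_Proxy on its own: fetch one page through a
# freshly scraped proxy. Not called anywhere in this script; note that
# Proxy_Request retries forever, so this can spin for a long time when no
# listed proxy works.
def _demo_random_proxy():
    client = Random_Proxy()
    response = client.Proxy_Request(request_type='get', url='https://www.gsmarena.com/')
    print(response.status_code)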


# The Gsmarena class scrapes the website's phone brands and their devices and
# saves each brand to its own csv file.
class Gsmarena():

    # Constructor that initializes variables used throughout the program.
    def __init__(self):
        self.phones = []
        self.features = ["Brand", "Model Name", "Model Image"]
        self.temp1 = []
        self.phones_brands = []
        self.url = 'https://www.gsmarena.com/'  # GSMArena website url.
        self.new_folder_name = 'GSMArenaDataset'  # Folder in which the csv files are saved.
        self.absolute_path = os.path.join(os.getcwd(), self.new_folder_name)  # Absolute path of the GSMArenaDataset folder.

    # This function fetches and parses the html of the requested URL.
    def crawl_html_page(self, sub_url):
        url = self.url + sub_url  # Url of the page whose html is parsed.

        # Handle connection errors while fetching the url.
        try:
            proxy = Random_Proxy()
            r = proxy.Proxy_Request(url=url, request_type='get')
            soup = BeautifulSoup(r.content, 'html.parser')  # Parse the html data of the requested url.
            return soup
        except Exception:
            print("Please check your network connection and re-run the script.")
            sys.exit(1)

    # This function crawls the list of mobile phone brands and returns it as
    # [maker, brand name, brand link] entries.
    def crawl_phone_brands(self):
        phones_brands = []
        soup = self.crawl_html_page('makers.php3')
        table = soup.find_all('table')[0]
        table_a = table.find_all('a')
        for a in table_a:
            temp = [a['href'].split('-')[0], a.find('span').text.split(' ')[0], a['href']]
            phones_brands.append(temp)
        return phones_brands

    # This function crawls one brand's model page links (following the brand's
    # pagination) and returns the list of links.
    def crawl_phones_models(self, phone_brand_link):
        links = []
        nav_link = []
        soup = self.crawl_html_page(phone_brand_link)
        nav_data = soup.find(class_='nav-pages')
        if not nav_data:
            nav_link.append(phone_brand_link)
        else:
            nav_link = nav_data.findAll('a')
            nav_link = [link['href'] for link in nav_link]
            nav_link.append(phone_brand_link)
            nav_link.insert(0, nav_link.pop())  # Move the first page to the front.
        for link in nav_link:
            soup = self.crawl_html_page(link)
            data = soup.find(class_='section-body')
            for line1 in data.findAll('a'):
                links.append(line1['href'])

        return links

    # This function crawls one device's specification page and returns all of
    # its specs as a dictionary.
    def crawl_phones_models_specification(self, link, phone_brand):
        phone_data = {}
        soup = self.crawl_html_page(link)
        model_name = soup.find(class_='specs-phone-name-title').text
        model_img_html = soup.find(class_='specs-photo-main')
        model_img = model_img_html.find('img')['src']
        phone_data.update({"Brand": phone_brand})
        phone_data.update({"Model Name": model_name})
        phone_data.update({"Model Image": model_img})
        for table in soup.findAll('table'):
            for line in table.findAll('tr'):
                temp = []
                for l in line.findAll('td'):
                    text = l.getText().strip().replace("\n", "")
                    temp.append(text)
                if not temp:
                    # Skip rows that have no <td> cells.
                    continue
                if temp[0] in phone_data:
                    # Rename duplicate spec labels so they are not overwritten.
                    temp[0] = temp[0] + '_1'
                if temp[0] not in self.features:
                    self.features.append(temp[0])
                if len(temp) > 1:
                    phone_data.update({temp[0]: temp[1]})
        return phone_data

    # This function creates the 'GSMArenaDataset' folder.
    def create_folder(self):
        if not os.path.exists(self.new_folder_name):
            print("Creating", self.new_folder_name, "folder....")
            os.makedirs(self.new_folder_name)
            print("Folder created.")
        else:
            print(self.new_folder_name, "directory already exists.")

    # This function lists the csv files already present in the
    # 'GSMArenaDataset' directory so already-crawled brands can be skipped.
    def check_file_exists(self):
        return os.listdir(self.absolute_path)

    # This function saves the device specifications of each brand to a csv file.
    def save_specification_to_file(self):
        phone_brands = self.crawl_phone_brands()
        self.create_folder()
        files_list = self.check_file_exists()
        for brand in phone_brands:
            phones_data = []
            if (brand[0].title() + '.csv') not in files_list:
                links = self.crawl_phones_models(brand[2])
                model_value = 1
                print("Working on", brand[0].title(), "brand.")
                for link in links:
                    datum = self.crawl_phones_models_specification(link, brand[0])
                    datum = {k: v.replace('\n', ' ').replace('\r', ' ') for k, v in datum.items()}
                    phones_data.append(datum)
                    print("Completed", model_value, "/", len(links))
                    model_value += 1
                with open(os.path.join(self.absolute_path, brand[0].title() + ".csv"), "w", newline='', encoding='utf8') as file:
                    dict_writer = csv.DictWriter(file, fieldnames=self.features)
                    dict_writer.writeheader()
                    for datum in phones_data:
                        dict_writer.writerow(datum)
                print("Data loaded in the file.")
            else:
                print(brand[0].title() + '.csv file already in your directory.')
        print('Crawling data finished. Stored in the dataset folder.\n')
        sys.exit(0)
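

# A small, hedged sketch of using the Gsmarena class on its own: list the
# brands the crawler would visit, without writing any files. Not called
# anywhere in this script; the request still goes through a random proxy, so
# it needs a working proxy to succeed.
def _demo_list_brands():
    scraper = Gsmarena()
    for maker, name, link in scraper.crawl_phone_brands():
        print(maker, name, link)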


def output_csv():
    try:
        obj = Gsmarena()
        obj.save_specification_to_file()
    except KeyboardInterrupt:
        print("Crawling has been stopped due to a keyboard interruption.")


def search_csv():
    # Ask which brand csv to search.
    brand = input('Enter brand you want to search\n')
    file_name = './GSMArenaDataset/' + brand + '.csv'

    device_name = brand + ' ' + input('Enter device name\n')
    print('Searching for device named ' + device_name)

    matches = 0

    # Read the brand csv and copy every row whose 'Model Name' matches the
    # input to Search_Result.csv.
    with open(file_name, "r", encoding='utf8') as r:
        r_file = csv.DictReader(r, delimiter=",")
        with open('Search_Result.csv', 'w', newline='', encoding='utf8') as w_file:
            writer = csv.DictWriter(w_file, fieldnames=r_file.fieldnames)
            writer.writeheader()
            for row in r_file:
                if device_name == row['Model Name']:
                    matches += 1
                    writer.writerow(row)

    print(str(matches) + ' device(s) found')
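

# search_csv matches 'Model Name' exactly and case-sensitively. A hedged
# sketch of a looser, case-insensitive substring match, in case that behavior
# is ever preferred (hypothetical helper, not wired into the menu below):
def _search_rows_loosely(rows, query):
    query = query.lower()
    return [row for row in rows if query in row.get('Model Name', '').lower()]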


def filter_file(filename):
    # Key spec columns to keep. Note that pandas raises a ValueError if any of
    # these columns is missing from the csv.
    field_name = {'Model Name', 'Announced', 'Dimensions', 'Weight', 'Build', 'Size', 'Resolution', 'Internal',
                  'Single', 'Chipset', 'GPU', '_1_1'}
    df = pd.read_csv(filename, usecols=field_name)
    df = df.transpose()  # Transpose so each device becomes a column.
    df.to_csv('Filter_Result.csv', encoding='utf-8', header=False)
    print('Filtering finished')
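

# Example invocation of filter_file; 'Apple.csv' is only an assumed file name
# that would exist after the Apple brand has been crawled:
#   filter_file('./GSMArenaDataset/Apple.csv')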


def filter_csv():
    filter_option = input(
        "Enter 1 or 2 to choose a mode: \n 1. Filter a database file. \n 2. Filter Search_Result.csv. \n")
    if filter_option == '1':
        # Ask which brand csv to filter.
        brand = input('Enter brand you want to filter\n')
        file_name = './GSMArenaDataset/' + brand + '.csv'
        filter_file(file_name)
    elif filter_option == '2':
        file_name = './Search_Result.csv'
        filter_file(file_name)


# The main function switches between crawling, searching, and filtering based
# on user input.
def main():
    user_option = input("Enter 1, 2 or 3 to choose a mode: \n 1. Output web crawler data to csv files. \n 2. Search for devices in existing csv files. \n 3. Filter key specs and transpose a csv file. \n")
    if user_option == '1':
        output_csv()
    elif user_option == '2':
        search_csv()
    elif user_option == '3':
        filter_csv()


if __name__ == "__main__":
    while True:
        main()