import csv
import os
import random
import sys

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Hard-coded pool of public proxies (ip:port). Proxies are removed from the front
# of the list as requests through them fail.
list_proxies = [
    "103.250.68.10:8080",
    "178.33.251.230:3129",
    "88.99.149.188:31288",
    "51.75.147.40:3128",
    "125.27.251.124:45861",
    "51.254.237.77:3129",
    "104.248.63.49:31583",
    "125.26.99.185:36525",
    "183.164.227.165:4216",
    "180.109.124.30:4216",
    "103.31.251.18:8080",
    "110.44.133.135:3128",
    "175.100.5.52:32721",
    "182.72.150.242:8080",
    "51.75.147.44:3128",
    "78.96.125.24:3128",
    "176.56.107.214:52184",
    "125.26.99.186:41358",
    "217.172.170.116:3838",
    "62.210.177.105:3128",
    "46.225.241.66:3128",
    "180.211.183.178:60604",
    "116.212.129.58:59557",
    "189.195.162.242:8080",
    "165.22.64.68:33874",
    "186.226.172.165:57783",
    "43.248.24.157:51166",
    "78.96.125.24:3128",
    "1.20.102.102:38816",
    "118.174.220.11:60148",
    "195.154.232.38:3838",
    "5.202.188.154:3128",
    "46.151.108.6:41171",
    "119.82.252.29:46872",
    "117.6.161.118:53281",
    "104.248.63.49:31583",
    "1.20.103.196:42792",
    "182.72.150.242:8080",
    "108.163.66.164:8080",
    "103.117.195.224:8686"
]

# Default browser-like headers sent with every proxied request.
_headers = {
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Referer': 'http://www.wikipedia.org/',
    'Connection': 'keep-alive',
}

def get_proxy_function():
    # Return the first proxy in the pool in the format requests expects.
    return {"https": list_proxies[0]}


def delete_proxy():
    # Drop the first proxy from the pool once it stops working.
    if len(list_proxies) > 0:
        list_proxies.pop(0)


def Proxy_Request(request_type='get', url='', **kwargs):
    """
    Send a request through the current proxy, rotating to the next proxy on failure.

    :param request_type: HTTP method ('get', 'post', 'put', ...)
    :param url: URL you want to scrape
    :param kwargs: any other parameters passed through to requests.request
    :return: the Response object, or None if every proxy in the pool failed
    """
    while len(list_proxies) > 0:
        proxy = get_proxy_function()
        print("Using Proxy {}".format(proxy))
        try:
            return requests.request(request_type, url, proxies=proxy, headers=_headers, timeout=8, **kwargs)
        except requests.exceptions.RequestException:
            # The current proxy is dead or too slow; discard it and try the next one.
            delete_proxy()
    print("All proxies in the pool have been exhausted.")
    return None


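# Minimal usage sketch for the proxy helper (commented out so it does not run on
# import): fetch a page through the rotating proxies and parse it with BeautifulSoup.
# The URL here is only an example.
#
#   response = Proxy_Request(request_type='get', url='https://www.gsmarena.com/makers.php3')
#   if response is not None:
#       print(BeautifulSoup(response.content, 'html.parser').title.get_text())
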
# class Random_Proxy(object):
#
#     def __init__(self):
#         self.__url = 'https://www.sslproxies.org/'
#         self.__headers = {
#             'Accept-Encoding': 'gzip, deflate, sdch',
#             'Accept-Language': 'en-US,en;q=0.8',
#             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
#             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
#             'Referer': 'http://www.wikipedia.org/',
#             'Connection': 'keep-alive',
#         }
#         self.random_ip = []
#         self.random_port = []
#
#     def __random_proxy(self):
#
#         """
#         This is a private function; clients should not have access to it.
#         :return: Dictionary object with a random proxy ip and port number
#         """
#
#         r = requests.get(url=self.__url, headers=self.__headers)
#         soup = BeautifulSoup(r.text, 'html.parser')
#
#         # Get the random IP addresses
#         for x in soup.findAll('td')[::8]:
#             self.random_ip.append(x.get_text())
#
#         # Get their ports
#         for y in soup.findAll('td')[1::8]:
#             self.random_port.append(y.get_text())
#
#         # Zip them together
#         z = list(zip(self.random_ip, self.random_port))
#
#         # Pick a random IP address and its corresponding port number
#         number = random.randint(0, len(z) - 50)
#         ip_random = z[number]
#
#         # Convert the tuple into a string and format the IP and port
#         ip_random_string = "{}:{}".format(ip_random[0], ip_random[1])
#
#         # Create a proxy
#         proxy = {'https': ip_random_string}
#
#         # Return the proxy
#         return proxy
#
#     def Proxy_Request(self, request_type='get', url='', **kwargs):
#         """
#
#         :param request_type: GET, POST, PUT
#         :param url: URL you want to scrape
#         :param kwargs: any other parameters you pass
#         :return: the Response object
#         """
#         while True:
#             try:
#                 proxy = get_proxy_function()
#                 print("Using Proxy {}".format(proxy))
#                 r = requests.request(request_type, url, proxies=proxy, headers=self.__headers, timeout=8, **kwargs)
#                 return r
#             except:
#                 pass


# The Gsmarena class scrapes the website's phone brands and device pages and saves
# each brand's devices to its own csv file.
class Gsmarena():

    # Constructor initialising common variables used throughout the program.
    def __init__(self):
        self.phones = []
        self.features = ["Brand", "Model Name", "Model Image"]
        self.temp1 = []
        self.phones_brands = []
        self.url = 'https://www.gsmarena.com/'  # GSMArena website url.
        self.new_folder_name = 'GSMArenaDataset'  # Folder in which the csv files are saved.
        self.absolute_path = os.path.join(os.getcwd(), self.new_folder_name)  # Absolute path of the GSMArenaDataset folder.

    # This function fetches and parses the html of the requested URL.
    def crawl_html_page(self, sub_url):

        url = self.url + sub_url  # Full url of the page to parse.

        # Handle connection errors while fetching the url.
        try:
            r = Proxy_Request(request_type="get", url=url)
            if r is None:
                print("All proxies failed. Please re-run the script with fresh proxies.")
                sys.exit()

            soup = BeautifulSoup(r.content, 'html.parser')  # Parse the html returned for the requested url.
            title = soup.find('title')
            if title is not None and title.get_text(strip=True) == "Too Many Requests":
                # The current proxy has been rate-limited; drop it and retry the page.
                delete_proxy()
                return self.crawl_html_page(sub_url)
            else:
                return soup

        except requests.exceptions.ConnectionError:
            print("Please check your network connection and re-run the script.")
            sys.exit()

        except Exception:
            print("Please check your network connection and re-run the script.")
            sys.exit()

    # This function crawls the mobile phone brands and returns them as a list.
    def crawl_phone_brands(self):
        phones_brands = []
        soup = self.crawl_html_page('makers.php3')
        table = soup.find_all('table')[0]
        table_a = table.find_all('a')
        for a in table_a:
            temp = [a['href'].split('-')[0], a.find('span').text.split(' ')[0], a['href']]
            phones_brands.append(temp)
        return phones_brands

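    # For illustration only: each entry returned by crawl_phone_brands() is expected to
    # look roughly like ['samsung', 'Samsung', 'samsung-phones-9.php'], i.e. a brand
    # slug, the brand name, and the relative link to the brand's listing page (the
    # exact href format is an assumption about GSMArena's URL scheme).
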
    # This function crawls every listing page of a brand and returns the links of all its device models.
    def crawl_phones_models(self, phone_brand_link):
        links = []
        nav_link = []
        soup = self.crawl_html_page(phone_brand_link)
        nav_data = soup.find(class_='nav-pages')
        if not nav_data:
            nav_link.append(phone_brand_link)
        else:
            nav_link = nav_data.findAll('a')
            nav_link = [link['href'] for link in nav_link]
            # Put the brand's first page at the front of the pagination list.
            nav_link.append(phone_brand_link)
            nav_link.insert(0, nav_link.pop())
        for link in nav_link:
            soup = self.crawl_html_page(link)
            data = soup.find(class_='section-body')
            for line1 in data.findAll('a'):
                links.append(line1['href'])

        return links

    # This function crawls a single device page and returns its specifications as a dictionary.
    def crawl_phones_models_specification(self, link, phone_brand):
        phone_data = {}
        try:
            soup = self.crawl_html_page(link)
            model_name = soup.find(class_='specs-phone-name-title').text
            model_img_html = soup.find(class_='specs-photo-main')
            model_img = model_img_html.find('img')['src']
            phone_data.update({"Brand": phone_brand})
            phone_data.update({"Model Name": model_name})
            phone_data.update({"Model Image": model_img})
            for table in soup.findAll('table'):
                for line in table.findAll('tr'):
                    temp = []
                    for l in line.findAll('td'):
                        text = l.getText().strip().replace("\n", "")
                        temp.append(text)
                    # Skip rows that do not contain a label/value pair.
                    if len(temp) < 2:
                        continue
                    if temp[0] in phone_data.keys():
                        temp[0] = temp[0] + '_1'
                    if temp[0] not in self.features:
                        self.features.append(temp[0])
                    phone_data.update({temp[0]: temp[1]})
        except Exception:
            print("Exception while parsing", link)
        return phone_data

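    # For illustration only: a phone_data dictionary produced above is expected to look
    # roughly like
    #   {'Brand': 'samsung', 'Model Name': 'Samsung Galaxy S20',
    #    'Model Image': 'https://.../galaxy-s20.jpg', 'Technology': 'GSM / HSPA / LTE', ...}
    # where every specification label found in the page's tables becomes a key. The
    # concrete model and values here are made-up examples, not scraped data.
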
    # This function creates the 'GSMArenaDataset' folder if it does not exist yet.
    def create_folder(self):
        if not os.path.exists(self.new_folder_name):
            print("Creating", self.new_folder_name, "folder....")
            os.makedirs(self.new_folder_name)
            print("Folder created.")
        else:
            print(self.new_folder_name, "directory already exists.")

    # This function returns the list of csv files already present in the 'GSMArenaDataset' directory.
    def check_file_exists(self):
        return os.listdir(self.absolute_path)

    # This function saves the device specifications of each brand to its own csv file.
    def save_specification_to_file(self):
        phone_brands = self.crawl_phone_brands()
        self.create_folder()
        files_list = self.check_file_exists()
        for brand in phone_brands:
            phones_data = []
            if (brand[0].title() + '.csv') not in files_list:
                links = self.crawl_phones_models(brand[2])
                model_value = 1
                print("Working on the", brand[0].title(), "brand.")
                for link in links:
                    datum = self.crawl_phones_models_specification(link, brand[0])
                    datum = {k: v.replace('\n', ' ').replace('\r', ' ') for k, v in datum.items()}
                    phones_data.append(datum)
                    print("Completed", model_value, "/", len(links))
                    model_value += 1
                with open(self.absolute_path + '/' + brand[0].title() + ".csv", "w", newline='', encoding='utf8') as file:
                    dict_writer = csv.DictWriter(file, fieldnames=self.features)
                    dict_writer.writeheader()
                    for row in phones_data:
                        dict_writer.writerow(row)
                print("Data loaded in the file.")
            else:
                print(brand[0].title() + '.csv file is already in your directory.')
            if brand == phone_brands[-1]:
                print('Crawling finished. The data is stored in the dataset folder.\n')
                sys.exit()

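# Minimal driver sketch (commented out; the interactive main() below is the real entry
# point): crawl every brand and write one csv per brand into GSMArenaDataset/.
#
#   scraper = Gsmarena()
#   scraper.save_specification_to_file()
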


def output_csv():
    try:
        while True:
            obj = Gsmarena()
            obj.save_specification_to_file()
    except KeyboardInterrupt:
        print("Crawling has been stopped due to a keyboard interruption.")


def search_csv():
    # Ask for the csv (brand) name you want to search in.
    brand = input('Enter the brand you want to search\n')
    file_name = './GSMArenaDataset/' + brand + '.csv'

    device_name = brand + ' ' + input('Enter the device name\n')
    print('Searching for a device named ' + device_name)

    matches = 0

    # Read the brand csv and copy every matching row into Search_Result.csv.
    with open(file_name, "r", encoding='utf8') as r_file, \
            open('Search_Result.csv', 'w', newline='', encoding='utf8') as w_file:
        reader = csv.DictReader(r_file, delimiter=",")
        writer = csv.DictWriter(w_file, fieldnames=reader.fieldnames)
        writer.writeheader()
        for row in reader:
            # If the row's 'Model Name' equals the input, write that row out.
            if device_name == row['Model Name']:
                matches += 1
                writer.writerow(row)

    print(str(matches) + ' device(s) have been found')

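# Example session for search_csv() (illustrative only; the brand and device name are
# made up):
#
#   Enter the brand you want to search
#   Samsung
#   Enter the device name
#   Galaxy S20
#   Searching for a device named Samsung Galaxy S20
#   1 device(s) have been found
#
# The matching rows end up in Search_Result.csv.
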
def filter_file(filename):
    # Key-spec columns to keep in the filtered output.
    field_name = {'Model Name', 'Announced', 'Dimensions', 'Weight', 'Build', 'Size', 'Resolution', 'Internal',
                  'Single', 'Chipset', 'GPU', '_1_1'}
    # Only load the key-spec columns that actually exist in this csv.
    df = pd.read_csv(filename, usecols=lambda column: column in field_name)
    df = df.transpose()
    df.to_csv('Filter_Result.csv', encoding='utf-8', header=False)
    print('Filtering finished')

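# Minimal usage sketch for filter_file() (commented out; the path and brand are
# hypothetical): keep only the key-spec columns of one brand's dataset and write the
# transposed table to Filter_Result.csv.
#
#   filter_file('./GSMArenaDataset/Samsung.csv')
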

def filter_csv():
    filter_option = input(
        "Enter 1 or 2 to choose a mode: \n 1. Filter a database file. \n 2. Filter Search_Result.csv. \n")
    if filter_option == '1':
        # Ask for the brand csv you want to filter.
        brand = input('Enter the brand you want to filter\n')
        file_name = './GSMArenaDataset/' + brand + '.csv'
        filter_file(file_name)
    elif filter_option == '2':
        file_name = './Search_Result.csv'
        filter_file(file_name)


# The main function switches between crawling the website, searching the existing csv
# files, and filtering a csv file, based on the option entered by the user.
def main():
    user_option = input(
        "Enter 1, 2 or 3 to choose a mode: \n 1. Output web crawler data to csv files. \n 2. Search for devices in existing csv files. \n 3. Filter key specs and transpose a csv file. \n")
    if user_option == '1':
        output_csv()
    elif user_option == '2':
        search_csv()
    elif user_option == '3':
        filter_csv()


if __name__ == "__main__":
    while True:
        main()