import csv
import os
import random
import sys

import pandas as pd

try:
    import requests
    from bs4 import BeautifulSoup
except ImportError:
    print("Library Not Found! Please install 'requests' and 'beautifulsoup4'.")
    sys.exit(1)
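
# Typical installation of the third-party dependencies imported above (package
# names only; the original pins no versions):
#   pip install requests beautifulsoup4 pandas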


def function_random():
    # Fallback list of public HTTPS proxies hard-coded in 2020; they are
    # almost certainly stale by now and would need replacing before use.
    list_a = ["51.15.166.107:3128",
              "188.68.56.248:3128",
              "177.202.43.110:60196",
              "162.144.35.146:3838",
              "178.252.166.210:8080",
              "83.97.23.90:18080",
              "50.246.120.125:8080",
              ]
    return {"https": random.choice(list_a)}
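

# A minimal sketch of how function_random() could be used with requests,
# assuming one of the hard-coded proxies above is still alive. Not called
# anywhere in this script; https://httpbin.org/ip is only an illustrative
# echo endpoint.
def _demo_function_random():
    proxy = function_random()
    r = requests.get('https://httpbin.org/ip', proxies=proxy, timeout=8)
    print(r.text)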


class Random_Proxy(object):

    def __init__(self):
        self.__url = 'https://www.sslproxies.org/'
        self.__headers = {
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'en-US,en;q=0.8',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Referer': 'http://www.wikipedia.org/',
            'Connection': 'keep-alive',
        }
        self.random_ip = []
        self.random_port = []

    def __random_proxy(self):
        """
        Private helper; clients should not access it directly.
        :return: dictionary with a random proxy and port number
        """
        r = requests.get(url=self.__url, headers=self.__headers)
        soup = BeautifulSoup(r.text, 'html.parser')

        # Get the IP addresses (every 8th cell of the proxy table).
        for x in soup.findAll('td')[::8]:
            self.random_ip.append(x.get_text())

        # Get their ports.
        for y in soup.findAll('td')[1::8]:
            self.random_port.append(y.get_text())

        # Zip together.
        z = list(zip(self.random_ip, self.random_port))

        # Fetch a random IP address and its corresponding port number.
        number = random.randint(0, len(z) - 1)
        ip_random = z[number]

        # Convert the tuple into an "IP:PORT" string.
        ip_random_string = "{}:{}".format(ip_random[0], ip_random[1])

        # Create a proxy dictionary in the format requests expects.
        proxy = {'https': ip_random_string}

        return proxy

    def Proxy_Request(self, request_type='get', url='', **kwargs):
        """
        :param request_type: get, post, put, etc.
        :param url: URL you want to scrape
        :param kwargs: any other parameters to pass through to requests
        :return: the response
        """
        # Keep trying fresh proxies until one request succeeds.
        while True:
            try:
                proxy = self.__random_proxy()
                print("Using Proxy {}".format(proxy))
                r = requests.request(request_type, url, proxies=proxy, headers=self.__headers, timeout=8, **kwargs)
                return r
            except requests.exceptions.RequestException:
                # The proxy was dead or too slow; try another one.
                continue
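

# A hedged usage sketch for Random_Proxy on its own: fetch one page through a
# freshly scraped proxy. Not called anywhere in this script; note that
# Proxy_Request retries forever, so this can spin for a long time when no
# listed proxy works.
def _demo_random_proxy():
    client = Random_Proxy()
    response = client.Proxy_Request(request_type='get', url='https://www.gsmarena.com/')
    print(response.status_code)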


# The Gsmarena class scrapes the website's phone brands and their devices and
# saves each brand to its own csv file.
class Gsmarena():

    # Constructor that initializes variables used throughout the program.
    def __init__(self):
        self.phones = []
        self.features = ["Brand", "Model Name", "Model Image"]
        self.temp1 = []
        self.phones_brands = []
        self.url = 'https://www.gsmarena.com/'  # GSMArena website url.
        self.new_folder_name = 'GSMArenaDataset'  # Folder in which the csv files are saved.
        self.absolute_path = os.path.join(os.getcwd(), self.new_folder_name)  # Absolute path of the GSMArenaDataset folder.

    # This function fetches and parses the html of the requested URL.
    def crawl_html_page(self, sub_url):
        url = self.url + sub_url  # Url of the page whose html is parsed.

        # Handle connection errors while fetching the url.
        try:
            proxy = Random_Proxy()
            r = proxy.Proxy_Request(url=url, request_type='get')
            soup = BeautifulSoup(r.content, 'html.parser')  # Parse the html data of the requested url.
            return soup
        except Exception:
            print("Please check your network connection and re-run the script.")
            sys.exit(1)

    # This function crawls the list of mobile phone brands and returns it as
    # [maker, brand name, brand link] entries.
    def crawl_phone_brands(self):
        phones_brands = []
        soup = self.crawl_html_page('makers.php3')
        table = soup.find_all('table')[0]
        table_a = table.find_all('a')
        for a in table_a:
            temp = [a['href'].split('-')[0], a.find('span').text.split(' ')[0], a['href']]
            phones_brands.append(temp)
        return phones_brands

    # This function crawls one brand's model page links (following the brand's
    # pagination) and returns the list of links.
    def crawl_phones_models(self, phone_brand_link):
        links = []
        nav_link = []
        soup = self.crawl_html_page(phone_brand_link)
        nav_data = soup.find(class_='nav-pages')
        if not nav_data:
            nav_link.append(phone_brand_link)
        else:
            nav_link = nav_data.findAll('a')
            nav_link = [link['href'] for link in nav_link]
            nav_link.append(phone_brand_link)
            nav_link.insert(0, nav_link.pop())  # Move the first page to the front.
        for link in nav_link:
            soup = self.crawl_html_page(link)
            data = soup.find(class_='section-body')
            for line1 in data.findAll('a'):
                links.append(line1['href'])

        return links

    # This function crawls one device's specification page and returns all of
    # its specs as a dictionary.
    def crawl_phones_models_specification(self, link, phone_brand):
        phone_data = {}
        soup = self.crawl_html_page(link)
        model_name = soup.find(class_='specs-phone-name-title').text
        model_img_html = soup.find(class_='specs-photo-main')
        model_img = model_img_html.find('img')['src']
        phone_data.update({"Brand": phone_brand})
        phone_data.update({"Model Name": model_name})
        phone_data.update({"Model Image": model_img})
        for table in soup.findAll('table'):
            for line in table.findAll('tr'):
                temp = []
                for l in line.findAll('td'):
                    text = l.getText().strip().replace("\n", "")
                    temp.append(text)
                if not temp:
                    # Skip rows that have no <td> cells.
                    continue
                if temp[0] in phone_data:
                    # Rename duplicate spec labels so they are not overwritten.
                    temp[0] = temp[0] + '_1'
                if temp[0] not in self.features:
                    self.features.append(temp[0])
                if len(temp) > 1:
                    phone_data.update({temp[0]: temp[1]})
        return phone_data

    # This function creates the 'GSMArenaDataset' folder.
    def create_folder(self):
        if not os.path.exists(self.new_folder_name):
            print("Creating", self.new_folder_name, "folder....")
            os.makedirs(self.new_folder_name)
            print("Folder created.")
        else:
            print(self.new_folder_name, "directory already exists.")

    # This function lists the csv files already present in the
    # 'GSMArenaDataset' directory so already-crawled brands can be skipped.
    def check_file_exists(self):
        return os.listdir(self.absolute_path)

    # This function saves the device specifications of each brand to a csv file.
    def save_specification_to_file(self):
        phone_brands = self.crawl_phone_brands()
        self.create_folder()
        files_list = self.check_file_exists()
        for brand in phone_brands:
            phones_data = []
            if (brand[0].title() + '.csv') not in files_list:
                links = self.crawl_phones_models(brand[2])
                model_value = 1
                print("Working on", brand[0].title(), "brand.")
                for link in links:
                    datum = self.crawl_phones_models_specification(link, brand[0])
                    datum = {k: v.replace('\n', ' ').replace('\r', ' ') for k, v in datum.items()}
                    phones_data.append(datum)
                    print("Completed", model_value, "/", len(links))
                    model_value += 1
                with open(os.path.join(self.absolute_path, brand[0].title() + ".csv"), "w", newline='', encoding='utf8') as file:
                    dict_writer = csv.DictWriter(file, fieldnames=self.features)
                    dict_writer.writeheader()
                    for datum in phones_data:
                        dict_writer.writerow(datum)
                print("Data loaded in the file.")
            else:
                print(brand[0].title() + '.csv file already in your directory.')
        print('Crawling data finished. Stored in the dataset folder.\n')
        sys.exit(0)
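

# A small, hedged sketch of using the Gsmarena class on its own: list the
# brands the crawler would visit, without writing any files. Not called
# anywhere in this script; the request still goes through a random proxy, so
# it needs a working proxy to succeed.
def _demo_list_brands():
    scraper = Gsmarena()
    for maker, name, link in scraper.crawl_phone_brands():
        print(maker, name, link)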


def output_csv():
    try:
        obj = Gsmarena()
        obj.save_specification_to_file()
    except KeyboardInterrupt:
        print("Crawling has been stopped due to a keyboard interruption.")


def search_csv():
    # Ask which brand csv to search.
    brand = input('Enter brand you want to search\n')
    file_name = './GSMArenaDataset/' + brand + '.csv'

    device_name = brand + ' ' + input('Enter device name\n')
    print('Searching for device named ' + device_name)

    matches = 0

    # Read the brand csv and copy every row whose 'Model Name' matches the
    # input to Search_Result.csv.
    with open(file_name, "r", encoding='utf8') as r:
        r_file = csv.DictReader(r, delimiter=",")
        with open('Search_Result.csv', 'w', newline='', encoding='utf8') as w_file:
            writer = csv.DictWriter(w_file, fieldnames=r_file.fieldnames)
            writer.writeheader()
            for row in r_file:
                if device_name == row['Model Name']:
                    matches += 1
                    writer.writerow(row)

    print(str(matches) + ' device(s) found')
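

# search_csv matches 'Model Name' exactly and case-sensitively. A hedged
# sketch of a looser, case-insensitive substring match, in case that behavior
# is ever preferred (hypothetical helper, not wired into the menu below):
def _search_rows_loosely(rows, query):
    query = query.lower()
    return [row for row in rows if query in row.get('Model Name', '').lower()]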


def filter_file(filename):
    # Key spec columns to keep. Note that pandas raises a ValueError if any of
    # these columns is missing from the csv.
    field_name = {'Model Name', 'Announced', 'Dimensions', 'Weight', 'Build', 'Size', 'Resolution', 'Internal',
                  'Single', 'Chipset', 'GPU', '_1_1'}
    df = pd.read_csv(filename, usecols=field_name)
    df = df.transpose()  # Transpose so each device becomes a column.
    df.to_csv('Filter_Result.csv', encoding='utf-8', header=False)
    print('Filtering finished')
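

# Example invocation of filter_file; 'Apple.csv' is only an assumed file name
# that would exist after the Apple brand has been crawled:
#   filter_file('./GSMArenaDataset/Apple.csv')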


def filter_csv():
    filter_option = input(
        "Enter 1 or 2 to choose a mode: \n 1. Filter a database file. \n 2. Filter Search_Result.csv. \n")
    if filter_option == '1':
        # Ask which brand csv to filter.
        brand = input('Enter brand you want to filter\n')
        file_name = './GSMArenaDataset/' + brand + '.csv'
        filter_file(file_name)
    elif filter_option == '2':
        file_name = './Search_Result.csv'
        filter_file(file_name)


# The main function switches between crawling, searching, and filtering based
# on user input.
def main():
    user_option = input("Enter 1, 2 or 3 to choose a mode: \n 1. Output web crawler data to csv files. \n 2. Search for devices in existing csv files. \n 3. Filter key specs and transpose a csv file. \n")
    if user_option == '1':
        output_csv()
    elif user_option == '2':
        search_csv()
    elif user_option == '3':
        filter_csv()


if __name__ == "__main__":
    while True:
        main()