# -*- coding: utf-8 -*-
from lxml import html
import MySQLdb
import requests
import json
import time
from datetime import datetime

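# Direct requests sometimes get served a bare landing page instead of the listing;
# when that happens the scraper falls back to the proxies read from proxy.txt
# (see url_scrapping below).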
proxyDict = {
    "https": "62.138.3.135:3128",
}
with open("/home/Svotin/django/scripts/proxy.txt", "r") as f:
    proxyList = f.read().split('\n')
# Other categories that can be added to the urls list below:
# 'https://www.citilink.ru/catalog/computers_and_notebooks/parts/cpu/',
# 'https://www.citilink.ru/catalog/computers_and_notebooks/hdd/ssd_in/',
# 'https://www.citilink.ru/catalog/computers_and_notebooks/parts/coolers/',
# 'https://www.citilink.ru/catalog/computers_and_notebooks/parts/motherboards/',
# 'https://www.citilink.ru/catalog/computers_and_notebooks/parts/memory/',
# 'https://www.citilink.ru/catalog/computers_and_notebooks/parts/videocards/',
# 'https://www.citilink.ru/catalog/computers_and_notebooks/hdd/hdd_in/',
# 'https://www.citilink.ru/catalog/computers_and_notebooks/parts/powersupply/',
def url_scrapping():
    urls = [
        "https://www.citilink.ru/catalog/mobile/notebooks/",
    ]
    for url in urls:
        userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0'
        headers = {
            'User-Agent': userAgent,
        }
        r = requests.get(url, headers=headers)
        tree = html.fromstring(r.text)
        get_products(tree)
        skipped = 0
        for i in range(2, 1000):
            page = i - skipped
            newUrl = url + "?p=" + str(page)
            if skipped > 0:
                # The site has started refusing direct requests: go through the next proxy.
                proxyDict = {
                    "https": proxyList[skipped],
                }
                r = requests.get(newUrl, headers=headers, proxies=proxyDict)
            else:
                r = requests.get(newUrl, headers=headers)
            tree = html.fromstring(r.text)
            pageTitle = tree.findtext('.//title') or ""
            print(str(i) + " " + pageTitle + " " + newUrl)
            if "не найдена" in pageTitle:  # "page not found": ran past the last page
                break
            if pageTitle == "Ситилинк":
                # Got the generic landing page instead of the listing: switch to the
                # next proxy, wait a bit and retry the same page number.
                skipped += 1
                time.sleep(3)
                continue
            get_products(tree)

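# fill_db() creates the `citilink` table on first use, inserts a product the first
# time it is seen, and on later runs records the old price via add_to_history()
# before updating it.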
def fill_db(title, price, link, image, category, vendor):
    db = MySQLdb.connect(host="svotin.mysql.pythonanywhere-services.com", user="Svotin",
                         passwd="Qwerty2102", db="Svotin$mvideo_ru", use_unicode=True, charset="utf8")
    cursor = db.cursor()
    price_digit = float(price)
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS citilink (
            id INT AUTO_INCREMENT NOT NULL,
            name CHAR(70) DEFAULT 'citilink' NOT NULL,
            price INT(20) UNSIGNED DEFAULT '0' NOT NULL,
            currency CHAR(10) DEFAULT '$' NOT NULL,
            link CHAR(100) NOT NULL,
            link_to_image CHAR(120) NOT NULL,
            category CHAR(100) NOT NULL,
            vendor CHAR(100) NOT NULL,
            PRIMARY KEY (id)
        );
    ''')
    # Parameterised queries let MySQLdb handle quoting (product names may contain quotes).
    cursor.execute("SELECT * FROM citilink WHERE name = %s", (title,))
    rows = cursor.fetchall()
    if len(rows) < 1:
        cursor.execute('''
            INSERT INTO citilink (name, price, currency, link, link_to_image, category, vendor)
            VALUES (%s, %s, 'RUB', %s, %s, %s, %s)''',
            (title, price_digit, link, image, category, vendor))
    elif len(rows) == 1:
        # rows[0][0] is the row id, rows[0][2] the previously stored price.
        add_to_history(cursor, "citilink", rows[0][0], datetime.today().strftime('%Y.%m.%d'), rows[0][2], price_digit)
        cursor.execute("UPDATE citilink SET price = %s WHERE name = %s", (price_digit, title))
    db.commit()
    db.close()

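# add_to_history() keeps a simple price-change log in the `old_prices` table.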
def add_to_history(cursor, market, id, date, old_sum, new_sum):
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS old_prices (
            id INT AUTO_INCREMENT NOT NULL,
            market CHAR(70) DEFAULT 'mvideo' NOT NULL,
            market_id INT(20) UNSIGNED DEFAULT '0' NOT NULL,
            date CHAR(14) DEFAULT '10.1.1990' NOT NULL,
            old_sum INT(20) NOT NULL,
            new_sum INT(20) NOT NULL,
            PRIMARY KEY (id)
        );
    ''')
    cursor.execute('''
        INSERT INTO old_prices (market, market_id, date, old_sum, new_sum)
        VALUES (%s, %s, %s, %s, %s)''', (market, id, date, old_sum, new_sum))

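# get_products() pulls every product card from a listing page and hands the parsed
# fields to fill_db().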
def get_products(tree):
    nodes = tree.xpath('''
        //div[@class="block_data__gtm-js block_data__pageevents-js listing_block_data__pageevents-js"]/div[@data-list-id="main"]
    ''')
    for i in nodes:
        try:
            # Each product card keeps its price/brand/category payload as JSON in "data-params".
            info = i.xpath('.//@data-params')[0]
            atr = i.xpath('.//div/div/div[@class="wrap-img"]/a/img')[0].attrib
            try:
                image = atr['data-src']
            except KeyError:
                image = atr['src']
            link_to_product = i.xpath('.//div/div/div[@class="wrap-img"]/a/@href')[0]
            name = atr["alt"]
            jsonInfo = json.loads(info)
            price = jsonInfo['price']
            vendor = jsonInfo["brandName"]
            category = jsonInfo['categoryName']
            desc = i.xpath('''
                .//div[@class="product_name cms_item_panel subcategory-product-item__info"]/p[@class="short_description"]
            ''')[0].text.replace(" ", '').replace(' —', "").replace('\n', '').split(';')[1]
            if desc:
                desc = desc[0].upper() + desc[1:]
            fill_db(name, price, link_to_product, image, category, vendor)
        except Exception:
            # Skip cards that do not match the expected markup (ads, banners, etc.).
            continue
## print(name, price, image, vendor, category, link_to_product, desc)

url_scrapping()