# -*- coding: utf-8 -*-
# https://github.com/ian-kerins/amazon-python-scrapy-scraper

# Usage:
#   cd amazon_scrapy
#   scrapy crawl amazon -o amazon_test_FR2.csv

import scrapy
from bs4 import BeautifulSoup
from urllib.parse import urlencode, unquote
import re
import json
import requests
import time

# INPUTS - update the file name and country code below

# To read URLs from a file instead of the hard-coded list:
# file = open('urls_fr.txt', 'r')
# queries = file.read()
# file.close()
# queries = queries.split('\n')
queries = ['https://www.amazon.co.uk/dp/B07G3Q9JWT/']

row_count = len(queries)

print('\nURL LIST HAS ' + str(row_count) + ' ROWS\n')

API = '9165dfdd318c0570418a52c726f10390'  # Insert your ScraperAPI key here. Sign up for a free trial with 5,000 requests: https://www.scraperapi.com/signup

# Supported country_code values:
# United States (us), Canada (ca), United Kingdom (uk), Germany (de), France (fr),
# Spain (es), Brazil (br), Mexico (mx), India (in), Japan (jp), China (cn) and Australia (au)

def get_url(url):
    # NOTE: country_code is 'fr' here but the sample URL targets amazon.co.uk; set it to match
    # the marketplace you are actually scraping ('uk' for amazon.co.uk).
    payload = {'api_key': API, 'url': url, 'country_code': 'fr', 'render': 'true'}
    proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
    return proxy_url
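
# For illustration, get_url('https://www.amazon.co.uk/dp/B07G3Q9JWT/') returns roughly:
#   http://api.scraperapi.com/?api_key=<API>&url=https%3A%2F%2Fwww.amazon.co.uk%2Fdp%2FB07G3Q9JWT%2F&country_code=fr&render=true
# i.e. every request is fetched through ScraperAPI's proxy with JavaScript rendering enabled.
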

class AmazonSpider(scrapy.Spider):
    name = 'amazon'

    def start_requests(self):
        for query in queries:
            yield scrapy.Request(url=get_url(query), callback=self.parse_product_page)

    def parse_product_page(self, response):
        title = (response.xpath('//*[@id="productTitle"]/text()').extract_first() or '').strip()
        price = response.xpath('//*[@id="priceblock_ourprice"]/text()').extract_first()

        # Q&A pairs are extracted separately via extract_qa() below
        brand = self.extract_brand(response)

        description = response.xpath('//*[@id="productDescription"]/p/text()').extract_first()
        variations = response.xpath('//div[@id="variation_size_name"]/ul/li//text()').extract()
        variations = [v.strip() for v in variations if v.strip()]
        images = self.extract_images(response)
        stock_status = self.extract_stock_status(response)
        seller = self.extract_seller(response)
        location = (response.xpath('//*[@id="glow-ingress-line2"]/text()').extract_first() or '').strip()
        amazon_choice = response.xpath(
            '//*[@id="acBadge_feature_div"]//*[contains(text(), "Amazon\'s Choice")]').extract()
        amazon_choice = len(amazon_choice) > 0

        attributes = self.extract_product_attributes(response)
        bulletpoints = self.extract_bulletpoints(response)
        has_video = len(response.xpath('//*[contains(@class, "videoBlockIngress")]').extract()) > 0
        important_info = response.xpath('//div[@id="important-information"]//p/text()').extract_first()
        climate_friendly = response.xpath('//*[@id="climatePledgeFriendly"]//p/text()').extract_first()
        aplus = len(response.xpath('//*[contains(@class, "aplus-v2")]').extract()) > 0

        rating = self.extract_rating(response)
        reviews = self.extract_reviews(response)
        reviews_terms = self.extract_reviews_terms(response)
        product_information = self.extract_product_information(response)

        # Fall back to alternative price locations when the main price block is absent
        if not price:
            price = response.xpath('//*[@data-asin-price]/@data-asin-price').extract_first() or \
                response.xpath('//*[@id="price_inside_buybox"]/text()').extract_first()

        qa = self.extract_qa(response)

        # Strip the ScraperAPI wrapper from the request URL to recover the original product URL
        product_url = unquote(response.request.url.replace('http://api.scraperapi.com/?api_key=' + API + '&url=', '')).split('&')[0]

        yield {
            'Title': title,
            'URL': product_url,
            'Price': price.strip() if isinstance(price, str) else price,
            'Brand': brand,
            '2nd Images': images,
            '2nd Image Count': len(images),
            'Has Video': 'Yes' if has_video else 'No',
            'Amazon Choice': 'Yes' if amazon_choice else 'No',
            'Variations': variations,
            'Attributes': attributes,
            'Description': description.strip() if isinstance(description, str) else description,
            'BulletPoints': bulletpoints,
            'BulletPoints Count': len(bulletpoints),
            'Important Info': important_info,
            'Climate Friendly': climate_friendly.strip() if isinstance(climate_friendly, str) else '',
            'Has A+ Content': aplus,
            'Stock Status': stock_status,
            'Seller': seller,
            'Rating': rating,
            'Reviews': reviews,
            'Reviews Terms': reviews_terms,
            'Product Information': product_information,
            'Location': location,
            'QA': qa
        }
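        # Each yielded dict becomes one row of the CSV written by `scrapy crawl amazon -o <file>.csv`;
        # list/dict fields (Variations, Attributes, QA, ...) get flattened to strings by the exporter.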

    def extract_seller(self, response):
        seller = response.xpath('//div[@id="merchant-info"]/*[@id="sellerProfileTriggerId"]/text()').extract_first()

        if seller is None:
            seller = response.xpath('//div[@id="merchant-info"]/text()').extract_first()

        # merchant-info can be absent altogether, so guard against None before stripping
        return seller.strip() if seller else ''

    def extract_stock_status(self, response):
        out_of_stock = response.xpath('//div[@id="outOfStock"]/text()').extract_first()
        if out_of_stock:
            return 'Out of stock'

        status = ''
        statuses = response.xpath('//div[@id="availability"]//text()').extract()
        for x in statuses:
            if len(x.strip()) > 0:
                status = x.strip()
                break

        return status

    def extract_product_attributes(self, response):
        attributes = {}

        rows = response.xpath('//div[@id="productOverview_feature_div"]//tr').extract()
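
        # Each product-overview row is a two-column <tr>: attribute name in the first <td>,
        # value in the second, yielding e.g. {'Brand': 'Example Co', 'Colour': 'Black'}
        # (illustrative values; the keys are whatever Amazon lists for the product).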
        for row in rows:
            dom = BeautifulSoup(row, 'html.parser')
            cells = dom.find_all('td')
            # Guard against rows that lack the expected name/value cell pair
            if len(cells) < 2:
                continue

            attributes[cells[0].text.strip()] = cells[1].text.strip()

        return attributes

    def extract_bulletpoints(self, response):
        items = response.xpath('//div[@id="featurebullets_feature_div"]//ul/li//text()').extract()

        # Strip whitespace and drop empty fragments so 'BulletPoints Count' reflects real bullets
        items = [x.strip() for x in items if x.strip()]

        return items

    def extract_reviews(self, response):
        reviews = response.xpath('//*[@id="acrCustomerReviewText"]/text()').extract_first()
        if reviews is None:
            return 0

        # e.g. "1,234 ratings" -> "1,234"
        return reviews.split(' ')[0]

    def extract_reviews_terms(self, response):
        state = response.xpath('//*[@id="cr-state-object"]/@data-state').extract_first()
        if state is None:
            return []

        state = json.loads(state)

        params = {
            'asin': state['asin'],
            'language': 'en_GB',
            'lazyWidget': 'cr-summarization-lighthut',
            'csrf': state['lazyWidgetCsrfToken']
        }
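
        # These are the same query params the review page itself sends when lazy-loading its
        # summarization widget; the asin and CSRF token come from the embedded cr-state-object blob.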
        resp = requests.get('https://www.amazon.co.uk/hz/reviews-render/ajax/lazy-widgets/stream', params=params)

        if resp.status_code != 200:
            return []

        data = []

        try:
            # Strip the '&&&' chunk delimiters, parse the remaining JSON array, and read the
            # HTML fragment in its third element, which contains the review "terms" chips
            jsonText = resp.text.replace('&&&', '').strip()
            obj = json.loads(jsonText)
            dom = BeautifulSoup(obj[2], 'html.parser')
            terms = dom.select('[data-hook="lighthut-terms-list"] .cr-lighthouse-terms > span')
            for term in terms:
                data.append(term.text.strip())
        except Exception:
            pass

        return data

    def extract_qa(self, response):
        try:
            state = response.xpath('//*[@id="cr-state-object"]/@data-state').extract_first()
            if state is None:
                return []

            state = json.loads(state)

            params = {
                'asin': state['asin'],
                '_': time.time()  # presumably a cache-busting timestamp
            }
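
            # askWidget.html returns the product's Q&A teaser as a plain HTML fragment for the
            # given ASIN, so it can be parsed with BeautifulSoup without re-fetching the page.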
            resp = requests.get('https://www.amazon.co.uk/gp/ask-widget/askWidget.html', params=params)

            dom = BeautifulSoup(resp.content, 'html.parser')
            result = []
            items = dom.select('.askTeaserQuestions > div')

            for item in items:
                q = item.find('div', id=lambda x: x and x.startswith('question'))
                a = item.select(
                    '.a-fixed-left-grid-inner .a-fixed-left-grid-col.a-col-right '
                    '.a-fixed-left-grid.a-spacing-base .a-fixed-left-grid-col.a-col-right')

                question = ''
                answer = ''

                if q and q.find('a'):
                    question = q.find('a').text.strip()

                # Long answers are collapsed into span.askLongText with a trailing "see less" toggle
                if len(a) and a[0].find('span', {'class': 'askLongText'}) is not None:
                    answer = a[0].find('span', {'class': 'askLongText'}).text.strip()
                    answer = answer.replace('see less', '').strip()
                elif len(a):
                    ans = a[0].select('noscript ~ span')
                    if len(ans) > 0:
                        answer = ans[0].text.strip()

                result.append({
                    'question': question,
                    'answer': answer
                })

            return result
        except Exception as e:
            self.logger.warning(str(e))
            return []

    def extract_rating(self, response):
        rating = response.xpath('//*[@id="acrPopover"]/@title').extract_first()

        if rating is None:
            return 0

        # Title looks like "4.6 out of 5 stars" -> keep the leading number
        return rating.split(' ')[0]

    def extract_images(self, response):
        images = []

        # The image gallery lives in a JavaScript 'colorImages' literal embedded in the page
        # source; swapping single for double quotes makes the captured block parseable as JSON
        match = re.search(r"'colorImages': {([\s\S]+?)\]},\n", response.text, re.MULTILINE)
        if match is None:
            return images

        jsonObj = json.loads("{" + match.group(1).replace("'", '"') + "]}")

        for img in jsonObj['initial']:
            images.append(img['hiRes'])

        return images

    def extract_brand(self, response):
        brand = response.xpath('//td/span[text()="Brand"]/../following-sibling::td/span/text()').extract_first()
        if brand is not None:
            return brand

        # Fall back to the byline text (e.g. "Brand: XYZ") and take its second word
        brand = response.xpath('//*[@id="bylineInfo"]/text()').extract_first()
        if brand is None:
            return ''

        parts = brand.strip().split(' ')
        return parts[1] if len(parts) > 1 else parts[0]

    def extract_product_information(self, response):
        extra_information = []

        product_information_rows = response.xpath('//table[@id="productDetails_techSpec_section_1"]//tr').extract()

        for row in product_information_rows:
            dom = BeautifulSoup(row, 'html.parser')
            header = dom.find('th')
            value = dom.find('td')
            # Skip rows that don't follow the <th>name</th><td>value</td> layout
            if header is None or value is None:
                continue

            extra_information.append({
                'key': header.text.strip(),
                'value': value.text.strip()
            })

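        # Only three rows of the detail-bullets table are kept: ASIN, Best Sellers Rank
        # and Date First Available; everything else is ignored.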
        additional_information_rows = response.xpath('//table[@id="productDetails_detailBullets_sections1"]//tr').extract()

        for row in additional_information_rows:
            dom = BeautifulSoup(row, 'html.parser')
            th = dom.find('th')
            if th is None:
                continue
            header = th.text.strip()

            if header == 'ASIN':
                extra_information.append({
                    'key': 'ASIN',
                    'value': dom.find('td').text.strip()
                })
            elif header == 'Best Sellers Rank':
                rankings = dom.find('td').text.strip()
                # Drop parenthesised text (e.g. the "(See Top 100 ...)" link) and flatten newlines
                rankings = re.sub(r"\(.*\)", '', rankings).replace("\n", ' ')
                extra_information.append({
                    'key': header,
                    'value': rankings
                })
            elif header == 'Date First Available':
                date = dom.find('td').text.strip()
                extra_information.append({
                    'key': header,
                    'value': date
                })

        return extra_information
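
# For illustration: a minimal sketch of running this spider without the scrapy CLI,
# assuming Scrapy >= 2.1 (the FEEDS setting). Uncomment to use:
#
# from scrapy.crawler import CrawlerProcess
#
# process = CrawlerProcess(settings={'FEEDS': {'amazon_test.csv': {'format': 'csv'}}})
# process.crawl(AmazonSpider)
# process.start()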