# -*- coding: utf-8 -*-
# https://github.com/ian-kerins/amazon-python-scrapy-scraper

# Usage:
#   cd amazon_scrapy
#   scrapy crawl amazon -o amazon_test_FR2.csv

import scrapy
from bs4 import BeautifulSoup
from urllib.parse import urlencode, unquote
import re
import json
import requests
import time

# INPUTS - update the file name and country code below

# To read URLs from a file instead of the hard-coded list:
# file = open('urls_fr.txt', 'r')
# queries = file.read()
# file.close()
# queries = queries.split('\n')
queries = ['https://www.amazon.co.uk/dp/B07G3Q9JWT/']

row_count = len(queries)

print('\nURL LIST HAS ' + str(row_count) + ' ROWS\n')

API = '9165dfdd318c0570418a52c726f10390'  # Insert your ScraperAPI key here. Sign up for a free trial with 5,000 requests: https://www.scraperapi.com/signup

# Supported country_code values:
# United States (us), Canada (ca), United Kingdom (uk), Germany (de), France (fr),
# Spain (es), Brazil (br), Mexico (mx), India (in), Japan (jp), China (cn) and Australia (au)

def get_url(url):
    # NOTE: country_code is 'fr' here but the sample URL targets amazon.co.uk; set it to match
    # the marketplace you are actually scraping ('uk' for amazon.co.uk).
    payload = {'api_key': API, 'url': url, 'country_code': 'fr', 'render': 'true'}
    proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
    return proxy_url
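
# For illustration, get_url('https://www.amazon.co.uk/dp/B07G3Q9JWT/') returns roughly:
#   http://api.scraperapi.com/?api_key=<API>&url=https%3A%2F%2Fwww.amazon.co.uk%2Fdp%2FB07G3Q9JWT%2F&country_code=fr&render=true
# i.e. every request is fetched through ScraperAPI's proxy with JavaScript rendering enabled.
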

class AmazonSpider(scrapy.Spider):
    name = 'amazon'

    def start_requests(self):
        for query in queries:
            yield scrapy.Request(url=get_url(query), callback=self.parse_product_page)

    def parse_product_page(self, response):
        title = (response.xpath('//*[@id="productTitle"]/text()').extract_first() or '').strip()
        price = response.xpath('//*[@id="priceblock_ourprice"]/text()').extract_first()

        # Q&A pairs are extracted separately via extract_qa() below
        brand = self.extract_brand(response)

        description = response.xpath('//*[@id="productDescription"]/p/text()').extract_first()
        variations = response.xpath('//div[@id="variation_size_name"]/ul/li//text()').extract()
        variations = [v.strip() for v in variations if v.strip()]
        images = self.extract_images(response)
        stock_status = self.extract_stock_status(response)
        seller = self.extract_seller(response)
        location = (response.xpath('//*[@id="glow-ingress-line2"]/text()').extract_first() or '').strip()
        amazon_choice = response.xpath(
            '//*[@id="acBadge_feature_div"]//*[contains(text(), "Amazon\'s Choice")]').extract()
        amazon_choice = len(amazon_choice) > 0

        attributes = self.extract_product_attributes(response)
        bulletpoints = self.extract_bulletpoints(response)
        has_video = len(response.xpath('//*[contains(@class, "videoBlockIngress")]').extract()) > 0
        important_info = response.xpath('//div[@id="important-information"]//p/text()').extract_first()
        climate_friendly = response.xpath('//*[@id="climatePledgeFriendly"]//p/text()').extract_first()
        aplus = len(response.xpath('//*[contains(@class, "aplus-v2")]').extract()) > 0

        rating = self.extract_rating(response)
        reviews = self.extract_reviews(response)
        reviews_terms = self.extract_reviews_terms(response)
        product_information = self.extract_product_information(response)

        # Fall back to alternative price locations when the main price block is absent
        if not price:
            price = response.xpath('//*[@data-asin-price]/@data-asin-price').extract_first() or \
                response.xpath('//*[@id="price_inside_buybox"]/text()').extract_first()

        qa = self.extract_qa(response)

        # Strip the ScraperAPI wrapper from the request URL to recover the original product URL
        product_url = unquote(response.request.url.replace('http://api.scraperapi.com/?api_key=' + API + '&url=', '')).split('&')[0]

        yield {
            'Title': title,
            'URL': product_url,
            'Price': price.strip() if isinstance(price, str) else price,
            'Brand': brand,
            '2nd Images': images,
            '2nd Image Count': len(images),
            'Has Video': 'Yes' if has_video else 'No',
            'Amazon Choice': 'Yes' if amazon_choice else 'No',
            'Variations': variations,
            'Attributes': attributes,
            'Description': description.strip() if isinstance(description, str) else description,
            'BulletPoints': bulletpoints,
            'BulletPoints Count': len(bulletpoints),
            'Important Info': important_info,
            'Climate Friendly': climate_friendly.strip() if isinstance(climate_friendly, str) else '',
            'Has A+ Content': aplus,
            'Stock Status': stock_status,
            'Seller': seller,
            'Rating': rating,
            'Reviews': reviews,
            'Reviews Terms': reviews_terms,
            'Product Information': product_information,
            'Location': location,
            'QA': qa
        }
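        # Each yielded dict becomes one row of the CSV written by `scrapy crawl amazon -o <file>.csv`;
        # list/dict fields (Variations, Attributes, QA, ...) get flattened to strings by the exporter.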

    def extract_seller(self, response):
        seller = response.xpath('//div[@id="merchant-info"]/*[@id="sellerProfileTriggerId"]/text()').extract_first()

        if seller is None:
            seller = response.xpath('//div[@id="merchant-info"]/text()').extract_first()

        # merchant-info can be absent altogether, so guard against None before stripping
        return seller.strip() if seller else ''

    def extract_stock_status(self, response):
        out_of_stock = response.xpath('//div[@id="outOfStock"]/text()').extract_first()
        if out_of_stock:
            return 'Out of stock'

        status = ''
        statuses = response.xpath('//div[@id="availability"]//text()').extract()
        for x in statuses:
            if len(x.strip()) > 0:
                status = x.strip()
                break

        return status

    def extract_product_attributes(self, response):
        attributes = {}

        rows = response.xpath('//div[@id="productOverview_feature_div"]//tr').extract()
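
        # Each product-overview row is a two-column <tr>: attribute name in the first <td>,
        # value in the second, yielding e.g. {'Brand': 'Example Co', 'Colour': 'Black'}
        # (illustrative values; the keys are whatever Amazon lists for the product).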
        for row in rows:
            dom = BeautifulSoup(row, 'html.parser')
            cells = dom.find_all('td')
            # Guard against rows that lack the expected name/value cell pair
            if len(cells) < 2:
                continue

            attributes[cells[0].text.strip()] = cells[1].text.strip()

        return attributes

    def extract_bulletpoints(self, response):
        items = response.xpath('//div[@id="featurebullets_feature_div"]//ul/li//text()').extract()

        # Strip whitespace and drop empty fragments so 'BulletPoints Count' reflects real bullets
        items = [x.strip() for x in items if x.strip()]

        return items

    def extract_reviews(self, response):
        reviews = response.xpath('//*[@id="acrCustomerReviewText"]/text()').extract_first()
        if reviews is None:
            return 0

        # e.g. "1,234 ratings" -> "1,234"
        return reviews.split(' ')[0]

    def extract_reviews_terms(self, response):
        state = response.xpath('//*[@id="cr-state-object"]/@data-state').extract_first()
        if state is None:
            return []

        state = json.loads(state)

        params = {
            'asin': state['asin'],
            'language': 'en_GB',
            'lazyWidget': 'cr-summarization-lighthut',
            'csrf': state['lazyWidgetCsrfToken']
        }
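
        # These are the same query params the review page itself sends when lazy-loading its
        # summarization widget; the asin and CSRF token come from the embedded cr-state-object blob.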
        resp = requests.get('https://www.amazon.co.uk/hz/reviews-render/ajax/lazy-widgets/stream', params=params)

        if resp.status_code != 200:
            return []

        data = []

        try:
            # Strip the '&&&' chunk delimiters, parse the remaining JSON array, and read the
            # HTML fragment in its third element, which contains the review "terms" chips
            jsonText = resp.text.replace('&&&', '').strip()
            obj = json.loads(jsonText)
            dom = BeautifulSoup(obj[2], 'html.parser')
            terms = dom.select('[data-hook="lighthut-terms-list"] .cr-lighthouse-terms > span')
            for term in terms:
                data.append(term.text.strip())
        except Exception:
            pass

        return data

    def extract_qa(self, response):
        try:
            state = response.xpath('//*[@id="cr-state-object"]/@data-state').extract_first()
            if state is None:
                return []

            state = json.loads(state)

            params = {
                'asin': state['asin'],
                '_': time.time()  # presumably a cache-busting timestamp
            }
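
            # askWidget.html returns the product's Q&A teaser as a plain HTML fragment for the
            # given ASIN, so it can be parsed with BeautifulSoup without re-fetching the page.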
            resp = requests.get('https://www.amazon.co.uk/gp/ask-widget/askWidget.html', params=params)

            dom = BeautifulSoup(resp.content, 'html.parser')
            result = []
            items = dom.select('.askTeaserQuestions > div')

            for item in items:
                q = item.find('div', id=lambda x: x and x.startswith('question'))
                a = item.select(
                    '.a-fixed-left-grid-inner .a-fixed-left-grid-col.a-col-right '
                    '.a-fixed-left-grid.a-spacing-base .a-fixed-left-grid-col.a-col-right')

                question = ''
                answer = ''

                if q and q.find('a'):
                    question = q.find('a').text.strip()

                # Long answers are collapsed into span.askLongText with a trailing "see less" toggle
                if len(a) and a[0].find('span', {'class': 'askLongText'}) is not None:
                    answer = a[0].find('span', {'class': 'askLongText'}).text.strip()
                    answer = answer.replace('see less', '').strip()
                elif len(a):
                    ans = a[0].select('noscript ~ span')
                    if len(ans) > 0:
                        answer = ans[0].text.strip()

                result.append({
                    'question': question,
                    'answer': answer
                })

            return result
        except Exception as e:
            self.logger.warning(str(e))
            return []

    def extract_rating(self, response):
        rating = response.xpath('//*[@id="acrPopover"]/@title').extract_first()

        if rating is None:
            return 0

        # Title looks like "4.6 out of 5 stars" -> keep the leading number
        return rating.split(' ')[0]

    def extract_images(self, response):
        images = []

        # The image gallery lives in a JavaScript 'colorImages' literal embedded in the page
        # source; swapping single for double quotes makes the captured block parseable as JSON
        match = re.search(r"'colorImages': {([\s\S]+?)\]},\n", response.text, re.MULTILINE)
        if match is None:
            return images

        jsonObj = json.loads("{" + match.group(1).replace("'", '"') + "]}")

        for img in jsonObj['initial']:
            images.append(img['hiRes'])

        return images

    def extract_brand(self, response):
        brand = response.xpath('//td/span[text()="Brand"]/../following-sibling::td/span/text()').extract_first()
        if brand is not None:
            return brand

        # Fall back to the byline text (e.g. "Brand: XYZ") and take its second word
        brand = response.xpath('//*[@id="bylineInfo"]/text()').extract_first()
        if brand is None:
            return ''

        parts = brand.strip().split(' ')
        return parts[1] if len(parts) > 1 else parts[0]

    def extract_product_information(self, response):
        extra_information = []

        product_information_rows = response.xpath('//table[@id="productDetails_techSpec_section_1"]//tr').extract()

        for row in product_information_rows:
            dom = BeautifulSoup(row, 'html.parser')
            header = dom.find('th')
            value = dom.find('td')
            # Skip rows that don't follow the <th>name</th><td>value</td> layout
            if header is None or value is None:
                continue

            extra_information.append({
                'key': header.text.strip(),
                'value': value.text.strip()
            })

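        # Only three rows of the detail-bullets table are kept: ASIN, Best Sellers Rank
        # and Date First Available; everything else is ignored.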
        additional_information_rows = response.xpath('//table[@id="productDetails_detailBullets_sections1"]//tr').extract()

        for row in additional_information_rows:
            dom = BeautifulSoup(row, 'html.parser')
            th = dom.find('th')
            if th is None:
                continue
            header = th.text.strip()

            if header == 'ASIN':
                extra_information.append({
                    'key': 'ASIN',
                    'value': dom.find('td').text.strip()
                })
            elif header == 'Best Sellers Rank':
                rankings = dom.find('td').text.strip()
                # Drop parenthesised text (e.g. the "(See Top 100 ...)" link) and flatten newlines
                rankings = re.sub(r"\(.*\)", '', rankings).replace("\n", ' ')
                extra_information.append({
                    'key': header,
                    'value': rankings
                })
            elif header == 'Date First Available':
                date = dom.find('td').text.strip()
                extra_information.append({
                    'key': header,
                    'value': date
                })

        return extra_information
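
# For illustration: a minimal sketch of running this spider without the scrapy CLI,
# assuming Scrapy >= 2.1 (the FEEDS setting). Uncomment to use:
#
# from scrapy.crawler import CrawlerProcess
#
# process = CrawlerProcess(settings={'FEEDS': {'amazon_test.csv': {'format': 'csv'}}})
# process.crawl(AmazonSpider)
# process.start()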