· 6 years ago · Apr 01, 2020, 06:28 AM
Lua Script:
function use_crawlera(splash)
    -- Route page requests through Crawlera (Zyte Smart Proxy Manager).
    -- Make sure you pass your Crawlera API key in the 'crawlera_user' arg.
    -- Have a look at the file spiders/quotes-js.py to see how to do it.
    -- Find your Crawlera credentials in https://app.scrapinghub.com/
    local user = splash.args.crawlera_user

    local host = 'proxy.crawlera.com'
    local port = 8010
    local session_header = 'X-Crawlera-Session'
    local session_id = 'create'

    splash:on_request(function (request)
        -- Avoid using Crawlera for subresource fetching to increase crawling
        -- speed: URLs containing '://static.' hosts, or referencing
        -- .png/.css/.js files or '/xjs' endpoints, are fetched directly.
        if string.find(request.url, '://static%.') ~= nil or
           string.find(request.url, '%.png$') ~= nil or
           string.find(request.url, '%.css') ~= nil or
           string.find(request.url, '/xjs') ~= nil or
           string.find(request.url, '%.js') ~= nil then
            return
        end

        request:set_header('X-Crawlera-Cookies', 'disable')
        request:set_header('X-Crawlera-profile', 'desktop')
        -- BUG FIX: this header was set twice in the original; once is enough.
        request:set_header('upgrade-insecure-requests', '1')
        request:set_header('Connection', 'keep-alive')
        request:set_header('DNT', '1')
        -- NOTE(review): 180000 ms = 180 s, but the spider passes a 60 s Splash
        -- timeout -- confirm the mismatch is intentional.
        request:set_header('X-Crawlera-Timeout', '180000')
        -- BUG FIX: send the tracked session id back to the proxy. Without this
        -- header, session_id and the on_response_headers handler below were
        -- dead code and every request could use a different outgoing IP.
        request:set_header(session_header, session_id)
        request:set_proxy{host, port, username=user, password=''}
    end)

    splash:on_response_headers(function (response)
        -- BUG FIX: the original condition was
        -- `type(response.headers[session_header]) ~= nil`, which is ALWAYS
        -- true because type() returns a string ('nil' for missing values).
        -- When the header was absent, session_id was clobbered with nil.
        if response.headers[session_header] ~= nil then
            session_id = response.headers[session_header]
        end
    end)
end
53
function main(splash)
    -- Entry point for the Splash 'execute' endpoint: render the target URL
    -- through Crawlera, then return a screenshot, a HAR log, and the HTML.
    splash.images_enabled = false
    splash.private_mode_enabled = false

    use_crawlera(splash)
    splash:set_user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36")

    splash:go(splash.args.url)
    splash:wait(10)
    splash:set_viewport_full()

    local result = {
        jpeg = splash:jpeg(),
        har = splash:har(),
        html = splash:html(),
    }
    return result
end
71
72
Spider:
74
75from pkgutil import get_data
76import scrapy
77from scrapy_splash import SplashRequest
78from w3lib.http import basic_auth_header
79import base64
80
81
82
class Splashlua(scrapy.Spider):
    """Render a JS-heavy page through Splash + Crawlera and save a screenshot.

    The Lua script (loaded from package data in ``__init__``) does the actual
    proxy routing and rendering; ``parse_category`` decodes the returned JPEG.
    """

    name = "splash_example"

    # NOTE(review): these are Splash-related settings; consider moving them to
    # settings.py since they apply whenever a SplashRequest is made.
    custom_settings = {
        'DOWNLOAD_DELAY': 30.0,
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        },
        'SPIDER_MIDDLEWARES': {
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
        'SPLASH_URL': 'SPLASH URL',
        'SPLASH_APIKEY': 'SPLASH APIKEY',
    }

    def __init__(self, *args, **kwargs):
        # To be able to load the Lua script on Scrapy Cloud, make sure your
        # project's setup.py file contains the "package_data" setting, similar
        # to this project's setup.py.
        self.LUA_SOURCE = get_data(
            'hosted_splash', 'scripts/lua_example.lua'
        ).decode('utf-8')
        super(Splashlua, self).__init__(*args, **kwargs)

    def start_requests(self):
        # Single seed request rendered via the Splash 'execute' endpoint.
        yield SplashRequest(
            url='https://nextdoor.com/pages/recology-american-canyon',
            callback=self.parse_category,
            endpoint="execute",
            # Splash instance auth (API key as basic-auth user, empty password).
            splash_headers={
                'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''),
            },
            args={
                'timeout': '60',
                'lua_source': self.LUA_SOURCE,
                'crawlera_user': self.settings['CRAWLERA_APIKEY'],
            },
            # Avoid re-sending the (large) Lua script with every request.
            cache_args=['lua_source'],
        )

    def parse_category(self, response):
        # FIX: removed a stray `pass` that made this method read as a stub
        # even though the statements below it were live code. Also removed a
        # duplicate commented-out `splash_headers` argument in start_requests.
        jpeg = response.data['jpeg']
        with open('hosted_immig3.jpeg', 'wb') as f:
            f.write(base64.b64decode(jpeg))
130
131
132
settings.py

# Zyte Smart Proxy (Crawlera) API key; the spider reads it via
# self.settings['CRAWLERA_APIKEY'] and forwards it to the Lua script.
# Replace the placeholder with your real key.
CRAWLERA_APIKEY = 'Crawlera APIkey'