· 5 years ago · Jul 01, 2020, 10:22 AM
function use_crawlera(splash)
  -- Route page requests through the Crawlera proxy with a sticky session.
  -- Make sure you pass your Crawlera API key in the 'crawlera_user' arg.
  -- Have a look at the file spiders/quotes-js.py to see how to do it.
  -- Find your Crawlera credentials in https://app.scrapinghub.com/
  local user = splash.args.crawlera_user

  local host = 'proxy.crawlera.com'
  local port = '8010'
  local session_header = 'X-Crawlera-Session'
  -- 'create' asks Crawlera to open a new session; replaced below with the
  -- real session id returned in the first response's headers.
  local session_id = 'create'

  splash:on_request(function (request)
    -- The commented code below can be used to speed up the crawling
    -- process. They filter requests to undesired domains and useless
    -- resources. Uncomment the ones that make sense to your use case
    -- and add your own rules.

    -- Discard requests to advertising and tracking domains.
    -- NOTE: string.find uses Lua patterns, where an unescaped '.' matches
    -- any character, so literal dots are escaped with '%'. (The original
    -- left 'ssl.' and 'bat.bing.com' unescaped — harmless for real URLs,
    -- but escaped here for correctness and consistency.)
    if string.find(request.url, 'doubleclick%.net') or
       string.find(request.url, 'analytics%.google%.com') or
       string.find(request.url, 'ssl%.google%.analytics%.com') or
       string.find(request.url, 'facebook%.com') or
       string.find(request.url, 'bat%.bing%.com') then
      request.abort()
      return
    end

    -- Avoid using Crawlera for subresources fetching to increase crawling
    -- speed. The example below avoids using Crawlera for URLS starting
    -- with 'static.' and the ones ending with '.png'.
    if string.find(request.url, '://static%.') ~= nil or
       string.find(request.url, '%.png$') ~= nil then
      return
    end

    request:set_header('X-Crawlera-profile', 'desktop')
    request:set_header('X-Crawlera-Cookies', 'disable')
    request:set_header(session_header, session_id)
    request:set_proxy{host, port, username=user, password=''}
  end)

  splash:on_response_headers(function (response)
    -- BUG FIX: the original tested `type(...) ~= nil`, which is ALWAYS
    -- true because type() returns a string (including the string "nil"),
    -- so a missing header wiped session_id with nil and broke the sticky
    -- session. Compare the header value itself instead.
    if response.headers[session_header] ~= nil then
      session_id = response.headers[session_header]
    end
  end)
end
47
function main(splash)
  -- Entry point for Splash: load splash.args.url through Crawlera,
  -- scroll the page splash.args.count times (to trigger lazy-loaded
  -- content), then return the rendered HTML.
  splash.images_enabled = false
  splash.private_mode_enabled = false
  splash.resource_timeout = 20
  use_crawlera(splash)

  local total_scrolls = splash.args.count
  local pause_after_scroll = 5.0

  -- JS helpers: one to scroll the viewport, one to read the current
  -- document height so each scroll jumps to the (new) bottom.
  local do_scroll = splash:jsfunc("window.scrollBy")
  local page_height = splash:jsfunc(
    "function() {return document.body.scrollHeight;}"
  )

  assert(splash:go(splash.args.url))
  splash:wait(10)

  for _ = 1, total_scrolls do
    do_scroll(0, page_height())
    splash:wait(pause_after_scroll)
  end

  return splash:html()
end