# Paste header: 4 years ago · Feb 06, 2021, 10:32 AM
1import scrapy
2import re
3
4
class CompaniesSpider(scrapy.Spider):
    """Spider that loads a Yandex Maps search page and then queries the maps search API.

    The first request fetches a hard-coded search results page.  Several
    values are then cut out of that page's text with regular expressions and
    substituted into a ``/maps/api/search`` URL, which is requested with
    ``self.crawl_api`` as its callback.
    """

    name = 'companies'
    allowed_domains = ['yandex.ru']
    # start_requests() is overridden below and yields its own first request.
    start_urls = ['http://yandex.ru/']

    # Percent-encoded search phrase, used both in the landing page path and
    # in the ``text`` parameter of the API query.
    _SEARCH_TEXT = ('%D0%9E%D1%85%D1%80%D0%B0%D0%BD%D0%BD%D0%BE%D0%B5'
                    '%20%D0%BF%D1%80%D0%B5%D0%B4%D0%BF%D1%80%D0%B8%D1%8F%D1%82%D0%B8%D0%B5')

    # Keys searched for in the landing page text, as ``"<key>":"<value>"``.
    _PAGE_KEYS = (
        'requestContext',
        'csrfToken',
        'requestSerpId',
        'sessionId',
        'll',
        'sspn',
    )

    def start_requests(self):
        """Yield the single request for the search results landing page."""
        landing_url = (
            f'https://yandex.ru/maps/213/moscow/search/{self._SEARCH_TEXT}/'
            '?ll=37.385524%2C55.584222&sll=37.385524%2C55.584222'
            '&sspn=7.888184%2C1.166794&z=9'
        )
        yield scrapy.Request(landing_url, callback=self.crawl_first_page)

    @staticmethod
    def _page_value(text, key):
        """Return the first ``"key":"value"`` value found in *text*, or None."""
        found = re.search(r'"%s":"(\S+?)"' % re.escape(key), text)
        return found.group(1) if found else None

    def crawl_first_page(self, response):
        """Build the search API request from values embedded in the landing page.

        Page key -> API query parameter:
            requestContext -> ctx
            csrfToken      -> csrfToken
            requestSerpId  -> parent_reqid and serpid
            sessionId      -> sessionId
            ll             -> ll
            sspn           -> spn

        Yields one ``scrapy.Request`` for the API URL.  If any key is absent
        from the page, the missing keys are logged and nothing is yielded.
        """
        # Diagnostics go to the spider logger rather than stdout.
        self.logger.debug('request cookies: %s', response.request.cookies)
        self.logger.debug('request headers: %s', response.request.headers)
        self.logger.debug('response headers: %s', response.headers)

        page_text = response.text
        values = {key: self._page_value(page_text, key) for key in self._PAGE_KEYS}
        missing = [key for key, value in values.items() if value is None]
        if missing:
            # Without every value the API URL would be malformed, so stop here
            # and report exactly which keys the page did not contain.
            self.logger.error(
                'Cannot build API request for %s: page keys not found: %s',
                response.url, ', '.join(missing),
            )
            return

        ctx = values['requestContext']
        csrf = values['csrfToken']
        parent_reqid = values['requestSerpId']
        cookies_session_id = values['sessionId']
        ll = values['ll']
        spn = values['sspn']

        # NOTE(review): the extracted values are inserted into the query string
        # exactly as found in the page. If they can contain reserved URL
        # characters they would need percent-encoding -- confirm against a
        # real response before changing this.
        #
        # To parse the API response: cut out "items":[{ ... with re.
        api_url = (
            'https://yandex.ru/maps/api/search?add_type=direct'
            '&ajax=1'
            f'&csrfToken={csrf}'
            f'&ctx={ctx}'
            '&direct_page_id=242'
            '&lang=ru_RU'
            f'&ll={ll}'
            '&origin=maps-scroll'
            f'&parent_reqid={parent_reqid}'
            '&results=25'
            '&s=2616087341'
            '&search_experimental_rearr%5B0%5D=scheme_Local%2FGeo%2FAdverts%2FInjectionV2%2FMaxadvTopMix%2FAdvertScoreMultiplier%3D1.05'
            '&search_experimental_rearr%5B1%5D=scheme_Local%2FGeo%2FAdverts%2FInjectionV2%2FMaxadvTopMix%2FScore%3D%27GudiniPScore%27'
            '&search_experimental_rearr%5B2%5D=scheme_Local%2FGeo%2FAdverts%2FInjectionV2%2FMaxadvTopMix%2FEnabled%3D1'
            f'&serpid={parent_reqid}'
            f'&sessionId={cookies_session_id}'
            '&skip=0'
            '&snippets=masstransit%2F2.x%2Cpanoramas%2F1.x%2Cbusinessrating%2F1.x%2Cbusinessimages%2F1.x%2Cphotos%2F2.x%2Cfuel%2F1.x%2Crealty_experimental%2F2.x%2Cexperimental%2F1.x%2Csubtitle%2F1.x%2Cexchange%2F1.x%2Cmatchedobjects%2F1.x%2Cdiscovery%2F1.x%2Cvisits_histogram%2F2.x%2Ctopobjects%2F1.x%2Cshowtimes%2F1.x%2Cpromo_mastercard%2F1.x%3Amastercardoffers%2Ctycoon_owners_personal%2F1.x%2Ctycoon_posts%2F1.x%2Crelated_adverts%2F1.x%2Crelated_adverts_1org%2F1.x%2Ccity_chains%2F1.x%2Croute_point%2F1.x%2Ctopplaces%2F1.x%2Cmetrika_snippets%2F1.x%2Cafisha_json_geozen%2F1.x%2Cplace_summary%2F1.x%2Cencyclopedia%2F1.x%2Conline_snippets%2F1.x%2Cbuilding_info_experimental%2F1.x%2Cprovider_data%2F1.x%2Cservice_orgs_experimental%2F1.x%2Cgeosmb%2F1.x'
            f'&spn={spn}'
            '&test-buckets=182561%2C0%2C77%3B325435%2C0%2C50%3B325618%2C0%2C29%3B259954%2C0%2C88%3B325544%2C0%2C47%3B318121%2C0%2C97%3B325255%2C0%2C48%3B326601%2C0%2C83%3B221199%2C0%2C79%3B204298%2C0%2C33%3B323316%2C0%2C54%3B324296%2C0%2C74%3B318450%2C0%2C41%3B313181%2C0%2C39'
            f'&text={self._SEARCH_TEXT}'
            '&yandex_gid=191'
            '&z=9'
        )

        # crawl_api is not defined in the visible part of this file; it is
        # expected to be provided elsewhere on this class.
        yield scrapy.Request(api_url, callback=self.crawl_api)