# Snippet pasted Jan 07, 2022, 04:30 PM
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import os
import re
import sys
from glob import glob
from pprint import pprint
from random import randint
from timeit import timeit

import lxml
import lxml.html
import requests
from bs4 import BeautifulSoup as bs
from html_to_json import convert as cnv
from lxml.html.clean import Cleaner
# Browser-like request headers shared by the Yandex API calls below
# (mimics Chrome 96 on Windows 10 so the endpoints answer with JSON).
headers = {
    'authority': 'yandex.ru',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
    'device-memory': '8',
    'rtt': '150',
    'sec-ch-ua-mobile': '?0',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
    'viewport-width': '575',
    'dpr': '1',
    'downlink': '4.15',
    'ect': '4g',
    'sec-ch-ua-platform': '"Windows"',
    'accept': 'application/json',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    'accept-language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6,zh;q=0.5',
}
34
35
def info(id):
    """Fetch related-image metadata ("rim") for a Yandex image document.

    Parameters
    ----------
    id : the Yandex document id (``rimId``) to look up.
         NOTE: the parameter shadows the ``id`` builtin; the name is kept
         so any keyword callers elsewhere keep working.

    Returns
    -------
    dict: the parsed JSON response. Raises ``requests.RequestException``
    on network failure and ``ValueError`` if the body is not JSON.
    """
    params = {
        'docid': str(id),
        'lang': 'ru',
        'mt': '1',
        'family': '0',
        'pornowhitelist': '1',
        'ipnd': '1',
    }
    # timeout keeps a stalled connection from hanging the whole script
    response = requests.get(
        'https://yandex.ru/images-apphost/rim',
        headers=headers,
        params=params,
        timeout=30,
    )
    return response.json()
49
50
def load_image(byte):
    """Upload raw image bytes to Yandex CBIR and return the parsed JSON reply.

    Parameters
    ----------
    byte : bytes — the raw (JPEG, per the content-type header) image payload.

    Returns
    -------
    dict: the parsed JSON response; the caller reads the uploaded-image
    ``url`` field out of it. Raises ``requests.RequestException`` on
    network failure and ``ValueError`` if the body is not JSON.
    """
    # Per-call headers: this endpoint needs content-type and origin,
    # unlike the module-level ``headers`` used for GET requests.
    upload_headers = {
        'authority': 'yandex.ru',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
        'device-memory': '8',
        'rtt': '200',
        'sec-ch-ua-mobile': '?0',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
        'viewport-width': '794',
        'content-type': 'image/jpeg',
        'dpr': '1',
        'downlink': '2.65',
        'ect': '4g',
        'sec-ch-ua-platform': '"Windows"',
        'accept': '*/*',
        'origin': 'https://yandex.ru',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'accept-language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6,zh;q=0.5',
    }

    params = {
        'cbird': '37',
        'images_avatars_size': 'preview',
        'images_avatars_namespace': 'images-cbir',
    }

    # Debug print of the raw response removed; timeout added so an
    # unresponsive upload endpoint cannot hang the script forever.
    response = requests.post(
        'https://yandex.ru/images-apphost/image-download',
        headers=upload_headers,
        params=params,
        data=byte,
        timeout=60,
    )
    return response.json()
84
85
def getInfoImage(url):
    """Search Yandex Images and yield the ``rimId`` of every serp item.

    Parameters
    ----------
    url : one of
        * dict — an upload response from ``load_image`` (its ``url`` field
          points at the uploaded preview on images-cbir);
        * str not starting with "http" — a plain text query;
        * str starting with "http" — a direct image URL for reverse search.

    Yields
    ------
    str: each ``rimId`` found in the serp items' ``data-bem`` payloads.
    """
    page_headers = {
        'authority': 'yandex.ru',
        'cache-control': 'max-age=0',
        'device-memory': '8',
        'dpr': '1',
        'viewport-width': '1280',
        'rtt': '200',
        'downlink': '2.2',
        'ect': '4g',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
        'accept-language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6,zh;q=0.5',
    }
    # BUG FIX: was ``isinstance(dict, type(url))`` — arguments reversed, which
    # is always False for a dict, so the upload branch was unreachable.
    if isinstance(url, dict):
        # NOTE(review): ``"".join(...)`` drops the "/" separators while
        # rebuilding the avatar URL — presumably this should be
        # ``"/".join(url['url'].split("/")[:-1]) + "orig"``; kept as-is
        # pending confirmation against a live upload response.
        params = {
            'url': "".join(url['url'].split("/")[:-2]) + "orig",
            'cbir_id': url['url'].split("get-images-cbir/")[-1].split("/preview")[0],
            'cbir_page': 'similar',
            'rpt': 'imageview',
            'family': '0',
            'pornowhitelist': '1',
            'ipnd': '1',
        }
    elif not url.startswith("http"):
        # Plain-text query.
        params = {
            'text': url,
            'from': 'tabbar',
            'family': '0',
            'pornowhitelist': '1',
            'ipnd': '1',
        }
    else:
        # Direct image URL for reverse search.
        params = {
            'url': url,
            'cbir_page': 'similar',
            'rpt': 'imageview',
            'family': '0',
            'pornowhitelist': '1',
            'ipnd': '1',
        }
    response = requests.get(
        'https://yandex.ru/images/search',
        headers=page_headers,
        params=params,
        timeout=30,
    )
    root = lxml.html.fromstring(response.content)
    # Each serp item stores its metadata as JSON in a data-bem attribute.
    for raw in root.xpath('//*[@id]/@data-bem'):
        item = json.loads(raw)
        serp = item.get("serp-item")
        if serp and "rimId" in serp:
            yield serp["rimId"]
146
147
def sJson(response, name):
    """Sanitize an HTML string and write it to ``<name>.html`` (UTF-8).

    Strips scripts, styles, and rel-link tags via lxml's Cleaner while
    keeping page structure and all element attributes.
    """
    cleaner = Cleaner(
        style=True,
        scripts=True,
        javascript=True,
        inline_style=True,
        links=True,
        add_nofollow=False,
        page_structure=True,
        safe_attrs_only=False,
    )
    cleaned = cleaner.clean_html(response)
    with open(f"{name}.html", "w", encoding="utf-8") as out:
        out.write(cleaned)
153
154
# Module-level accumulator of harvested image URLs, shared by the
# harvesting functions below (links_yd, get_from_dict, map_append).
all_links = []
156
157
def vldc(elem):
    """Return True when the URL *elem* can be fetched, False otherwise.

    BUG FIX: the original never returned on success (always falsy) and used
    a bare ``except`` that also swallowed KeyboardInterrupt/SystemExit.
    """
    try:
        requests.get(elem, timeout=10)
    except requests.RequestException:
        return False
    return True
163
164
def map_append(elem):
    """Push the image URL (``iu`` field) of one serp entry onto ``all_links``."""
    link = elem["iu"]
    all_links.append(link)
167
168
def get_from_dict(all_links, response):
    """Append every image URL (``iu``) under ``response["rld"]`` to *all_links*.

    BUG FIX: the original built ``map(map_append, infos)`` and never consumed
    the lazy map object, so nothing was ever appended. The explicit loop
    below actually collects the links, into the list passed by the caller.
    """
    for entry in response["rld"]:
        for item in entry["s"]:
            all_links.append(item["iu"])
173
174
def Glob_matching(src):
    """Return True when *src* names a dotted file in the current directory.

    Returns an explicit False (rather than the original's implicit None) on
    a miss; callers only use the result in boolean context, so this is
    backward compatible.
    """
    return src in glob("*.*")
179
def links_yd(uri):
    """Resolve *uri* to a list of related-image links via Yandex Images.

    Parameters
    ----------
    uri : one of
        * str path to a local image (a "C:" path or a file in the cwd);
        * bytes — a raw image payload;
        * str — a text query or an http(s) image URL.

    Returns
    -------
    list: the module-level ``all_links`` accumulator (links are appended
    across calls), or None when reading/uploading a local file fails.

    BUG FIXES vs. the original:
    * ``isinstance(str, type(uri))`` / ``isinstance(bytes, type(uri))`` had
      their arguments reversed and were always False, so the file and bytes
      branches were unreachable.
    * The first two branches iterated ``for l in response`` over a dict's
      keys while re-scanning ``response["rld"]`` each time, duplicating
      every link once per top-level key; all three branches now share one
      harvesting helper with the (correct) semantics of the URL branch.
    """
    if isinstance(uri, str) and (uri.startswith("C:") or Glob_matching(uri)):
        with open(uri, "rb") as image:
            payload = image.read()
        try:
            return _harvest(map(info, getInfoImage(load_image(payload))))
        except Exception:
            # Best-effort, as in the original: any upload/parse failure
            # for a local file yields None instead of raising.
            return None
    elif isinstance(uri, bytes):
        return _harvest(map(info, getInfoImage(load_image(uri))))
    else:
        return _harvest(map(info, getInfoImage(uri)))


def _harvest(responses):
    """Collect image links from an iterable of rim responses into ``all_links``."""
    for resp in responses:
        for entry in resp["rld"]:
            for item in entry["s"]:
                all_links.append(item["iu"])
                # Recurse one level into related documents when present.
                if "id" in item:
                    get_from_dict(all_links, info(item["id"]))
    return all_links
221
222
if __name__ == "__main__":
    # Demo entry point: run a text search for "cats" and print the links.
    # Guarded so importing this module no longer fires a network request.
    print(links_yd("cats"))