# Archived snapshot — Feb 07, 2021, 11:28 PM (web-scrape metadata, not part of the original source)
1# coding: utf-8
2
3from __future__ import unicode_literals
4
5
6import itertools
7import json
8import os.path
9import random
10import re
11import time
12import traceback
13
14from .common import InfoExtractor, SearchInfoExtractor
15from ..jsinterp import JSInterpreter
16from ..swfinterp import SWFInterpreter
17from ..compat import (
18 compat_chr,
19 compat_HTTPError,
20 compat_parse_qs,
21 compat_urllib_parse_unquote,
22 compat_urllib_parse_unquote_plus,
23 compat_urllib_parse_urlencode,
24 compat_urllib_parse_urlparse,
25 compat_urlparse,
26 compat_str,
27)
28from ..utils import (
29 bool_or_none,
30 clean_html,
31 error_to_compat_str,
32 ExtractorError,
33 float_or_none,
34 get_element_by_id,
35 int_or_none,
36 mimetype2ext,
37 parse_codecs,
38 parse_duration,
39 remove_quotes,
40 remove_start,
41 smuggle_url,
42 str_or_none,
43 str_to_int,
44 try_get,
45 unescapeHTML,
46 unified_strdate,
47 unsmuggle_url,
48 update_url_query,
49 uppercase_escape,
50 url_or_none,
51 urlencode_postdata,
52 urljoin,
53)
54
55
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    # Endpoints of Google's multi-step JSON sign-in flow:
    # account lookup -> password challenge -> (optional) TFA challenge.
    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Matches playlist IDs of the various known prefixes (plus the bare RDMM
    # radio-mix ID); used by subclasses to tell list URLs from video URLs.
    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)'

    def _set_language(self):
        """Force the English YouTube UI via the PREF cookie so that the
        regex-based scraping below sees stable, language-independent markup."""
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        """Convert a list of video IDs into url_result dicts that will be
        handled by the 'Youtube' extractor."""
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.

        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # Keep the documented contract: False means login failed
            # (previously this path returned a bare None).
            return False

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            # POST one step of the sign-in flow. The response carries a
            # non-JSON prefix before the first '[' which transform_source
            # strips so the payload parses as JSON.
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # Positional request payloads below mirror Google's undocumented
        # wire format; the meaning of most slots is unknown — do not reorder.
        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            # Consistent with the docstring: failure returns False.
            return False

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # NOTE: the conditional must be parenthesized. Without the
            # parentheses, '%' binds tighter than the ternary, so any
            # message other than INCORRECT_ANSWER_ENTERED was emitted
            # without the 'Unable to login:' prefix.
            warn('Unable to login: %s' % (
                'Invalid password'
                if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                # TL is an opaque token echoed back in the TFA endpoint URL.
                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                # Users often paste codes with the 'G-' prefix from SMS.
                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # Same precedence fix as the password failure above.
                    warn('Unable to finish TFA: %s' % (
                        'Invalid TFA code'
                        if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                # Any other challenge cannot be solved non-interactively;
                # point the user at the browser flow.
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        # A successful login redirects through a page referencing myaccount.
        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _real_initialize(self):
        # Without a downloader there is nothing to configure (and _login
        # would need it for warnings/options anyway).
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return

    # Minimal innertube client context sent with every _call_api request.
    _DEFAULT_API_DATA = {
        'context': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20201021.03.00',
            }
        },
    }

    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
    _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
    _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'

    def _call_api(self, ep, query, video_id):
        """Call the innertube (youtubei/v1) API endpoint *ep* with *query*
        merged into the default client context and return the parsed JSON."""
        # NOTE(review): .copy() is shallow — the nested 'context' dict is
        # shared between calls; callers must not mutate it in place.
        data = self._DEFAULT_API_DATA.copy()
        data.update(query)

        response = self._download_json(
            'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
            note='Downloading API JSON', errnote='Unable to download API page',
            data=json.dumps(data).encode('utf8'),
            headers={'content-type': 'application/json'},
            query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})

        return response

    def _extract_yt_initial_data(self, video_id, webpage):
        """Extract and parse the ytInitialData JSON blob embedded in a watch
        page; tries the bounded pattern first, then the bare one."""
        return self._parse_json(
            self._search_regex(
                (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE),
                 self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
            video_id)

    def _extract_ytcfg(self, video_id, webpage):
        """Extract the ytcfg.set({...}) configuration object from a page;
        returns {} (non-fatally) when absent."""
        return self._parse_json(
            self._search_regex(
                r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
                default='{}'), video_id, fatal=False)
310
311
312class YoutubeIE(YoutubeBaseInfoExtractor):
313 IE_DESC = 'YouTube.com'
314 _VALID_URL = r"""(?x)^
315 (
316 (?:https?://|//) # http(s):// or protocol-independent URL
317 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
318 (?:www\.)?deturl\.com/www\.youtube\.com/|
319 (?:www\.)?pwnyoutube\.com/|
320 (?:www\.)?hooktube\.com/|
321 (?:www\.)?yourepeat\.com/|
322 tube\.majestyc\.net/|
323 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
324 (?:(?:www|dev)\.)?invidio\.us/|
325 (?:(?:www|no)\.)?invidiou\.sh/|
326 (?:(?:www|fi)\.)?invidious\.snopyta\.org/|
327 (?:www\.)?invidious\.kabi\.tk/|
328 (?:www\.)?invidious\.13ad\.de/|
329 (?:www\.)?invidious\.mastodon\.host/|
330 (?:www\.)?invidious\.zapashcanon\.fr/|
331 (?:www\.)?invidious\.kavin\.rocks/|
332 (?:www\.)?invidious\.tube/|
333 (?:www\.)?invidiou\.site/|
334 (?:www\.)?invidious\.site/|
335 (?:www\.)?invidious\.xyz/|
336 (?:www\.)?invidious\.nixnet\.xyz/|
337 (?:www\.)?invidious\.drycat\.fr/|
338 (?:www\.)?tube\.poal\.co/|
339 (?:www\.)?tube\.connect\.cafe/|
340 (?:www\.)?vid\.wxzm\.sx/|
341 (?:www\.)?vid\.mint\.lgbt/|
342 (?:www\.)?yewtu\.be/|
343 (?:www\.)?yt\.elukerio\.org/|
344 (?:www\.)?yt\.lelux\.fi/|
345 (?:www\.)?invidious\.ggc-project\.de/|
346 (?:www\.)?yt\.maisputain\.ovh/|
347 (?:www\.)?invidious\.13ad\.de/|
348 (?:www\.)?invidious\.toot\.koeln/|
349 (?:www\.)?invidious\.fdn\.fr/|
350 (?:www\.)?watch\.nettohikari\.com/|
351 (?:www\.)?kgg2m7yk5aybusll\.onion/|
352 (?:www\.)?qklhadlycap4cnod\.onion/|
353 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
354 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
355 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
356 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
357 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
358 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
359 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
360 (?:.*?\#/)? # handle anchor (#/) redirect urls
361 (?: # the various things that can precede the ID:
362 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
363 |(?: # or the v= param in all its forms
364 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
365 (?:\?|\#!?) # the params delimiter ? or # or #!
366 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY)
367 v=
368 )
369 ))
370 |(?:
371 youtu\.be| # just youtu.be/xxxx
372 vid\.plus| # or vid.plus/xxxx
373 zwearz\.com/watch| # or zwearz.com/watch/xxxx
374 )/
375 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
376 )
377 )? # all until now is optional -> you can pass the naked ID
378 (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
379 (?!.*?\blist=
380 (?:
381 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
382 WL # WL are handled by the watch later IE
383 )
384 )
385 (?(1).+)? # if we found the ID, everything can follow
386 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
387 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
388 _PLAYER_INFO_RE = (
389 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
390 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
391 )
392 _formats = {
393 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
394 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
395 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
396 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
397 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
398 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
399 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
400 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
401 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
402 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
403 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
404 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
405 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
406 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
407 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
408 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
409 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
410 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
411
412
413 # 3D videos
414 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
415 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
416 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
417 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
418 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
419 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
420 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
421
422 # Apple HTTP Live Streaming
423 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
424 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
425 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
426 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
427 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
428 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
429 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
430 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
431
432 # DASH mp4 video
433 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
434 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
435 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
436 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
437 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
438 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
439 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
440 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
441 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
442 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
443 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
444 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
445
446 # Dash mp4 audio
447 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
448 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
449 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
450 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
451 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
452 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
453 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
454
455 # Dash webm
456 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
457 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
458 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
459 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
460 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
461 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
462 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
463 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
464 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
465 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
466 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
467 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
468 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
469 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
470 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
471 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
472 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
473 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
474 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
475 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
476 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
477 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
478
479 # Dash webm audio
480 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
481 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
482
483 # Dash webm audio with opus inside
484 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
485 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
486 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
487
488 # RTMP (unnamed)
489 '_rtmp': {'protocol': 'rtmp'},
490
491 # av01 video only formats sometimes served with "unknown" codecs
492 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
493 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
494 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
495 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
496 }
497 _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
498
499 _GEO_BYPASS = False
500
501 IE_NAME = 'youtube'
502 _TESTS = [
503 {
504 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
505 'info_dict': {
506 'id': 'BaW_jenozKc',
507 'ext': 'mp4',
508 'title': 'youtube-dl test video "\'/\\äâ†ð•',
509 'uploader': 'Philipp Hagemeister',
510 'uploader_id': 'phihag',
511 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
512 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
513 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
514 'upload_date': '20121002',
515 'description': 'test chars: "\'/\\äâ†ð•\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
516 'categories': ['Science & Technology'],
517 'tags': ['youtube-dl'],
518 'duration': 10,
519 'view_count': int,
520 'like_count': int,
521 'dislike_count': int,
522 'start_time': 1,
523 'end_time': 9,
524 }
525 },
526 {
527 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
528 'note': 'Embed-only video (#1746)',
529 'info_dict': {
530 'id': 'yZIXLfi8CZQ',
531 'ext': 'mp4',
532 'upload_date': '20120608',
533 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
534 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
535 'uploader': 'SET India',
536 'uploader_id': 'setindia',
537 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
538 'age_limit': 18,
539 }
540 },
541 {
542 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
543 'note': 'Use the first video ID in the URL',
544 'info_dict': {
545 'id': 'BaW_jenozKc',
546 'ext': 'mp4',
547 'title': 'youtube-dl test video "\'/\\äâ†ð•',
548 'uploader': 'Philipp Hagemeister',
549 'uploader_id': 'phihag',
550 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
551 'upload_date': '20121002',
552 'description': 'test chars: "\'/\\äâ†ð•\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
553 'categories': ['Science & Technology'],
554 'tags': ['youtube-dl'],
555 'duration': 10,
556 'view_count': int,
557 'like_count': int,
558 'dislike_count': int,
559 },
560 'params': {
561 'skip_download': True,
562 },
563 },
564 {
565 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
566 'note': '256k DASH audio (format 141) via DASH manifest',
567 'info_dict': {
568 'id': 'a9LDPn-MO4I',
569 'ext': 'm4a',
570 'upload_date': '20121002',
571 'uploader_id': '8KVIDEO',
572 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
573 'description': '',
574 'uploader': '8KVIDEO',
575 'title': 'UHDTV TEST 8K VIDEO.mp4'
576 },
577 'params': {
578 'youtube_include_dash_manifest': True,
579 'format': '141',
580 },
581 'skip': 'format 141 not served anymore',
582 },
583 # DASH manifest with encrypted signature
584 {
585 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
586 'info_dict': {
587 'id': 'IB3lcPjvWLA',
588 'ext': 'm4a',
589 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
590 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
591 'duration': 244,
592 'uploader': 'AfrojackVEVO',
593 'uploader_id': 'AfrojackVEVO',
594 'upload_date': '20131011',
595 },
596 'params': {
597 'youtube_include_dash_manifest': True,
598 'format': '141/bestaudio[ext=m4a]',
599 },
600 },
601 # Controversy video
602 {
603 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
604 'info_dict': {
605 'id': 'T4XJQO3qol8',
606 'ext': 'mp4',
607 'duration': 219,
608 'upload_date': '20100909',
609 'uploader': 'Amazing Atheist',
610 'uploader_id': 'TheAmazingAtheist',
611 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
612 'title': 'Burning Everyone\'s Koran',
613 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
614 }
615 },
616 # Normal age-gate video (No vevo, embed allowed), available via embed page
617 {
618 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
619 'info_dict': {
620 'id': 'HtVdAasjOgU',
621 'ext': 'mp4',
622 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
623 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
624 'duration': 142,
625 'uploader': 'The Witcher',
626 'uploader_id': 'WitcherGame',
627 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
628 'upload_date': '20140605',
629 'age_limit': 18,
630 },
631 },
632 {
633 # Age-gated video only available with authentication (unavailable
634 # via embed page workaround)
635 'url': 'XgnwCQzjau8',
636 'only_matching': True,
637 },
638 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
639 # YouTube Red ad is not captured for creator
640 {
641 'url': '__2ABJjxzNo',
642 'info_dict': {
643 'id': '__2ABJjxzNo',
644 'ext': 'mp4',
645 'duration': 266,
646 'upload_date': '20100430',
647 'uploader_id': 'deadmau5',
648 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
649 'creator': 'Dada Life, deadmau5',
650 'description': 'md5:12c56784b8032162bb936a5f76d55360',
651 'uploader': 'deadmau5',
652 'title': 'Deadmau5 - Some Chords (HD)',
653 'alt_title': 'This Machine Kills Some Chords',
654 },
655 'expected_warnings': [
656 'DASH manifest missing',
657 ]
658 },
659 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
660 {
661 'url': 'lqQg6PlCWgI',
662 'info_dict': {
663 'id': 'lqQg6PlCWgI',
664 'ext': 'mp4',
665 'duration': 6085,
666 'upload_date': '20150827',
667 'uploader_id': 'olympic',
668 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
669 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
670 'uploader': 'Olympic',
671 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
672 },
673 'params': {
674 'skip_download': 'requires avconv',
675 }
676 },
677 # Non-square pixels
678 {
679 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
680 'info_dict': {
681 'id': '_b-2C3KPAM0',
682 'ext': 'mp4',
683 'stretched_ratio': 16 / 9.,
684 'duration': 85,
685 'upload_date': '20110310',
686 'uploader_id': 'AllenMeow',
687 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
688 'description': 'made by Wacom from Korea | å—幕&åŠ æ²¹æ·»é†‹ by TY\'s Allen | 感è¬heylisa00cavey1001åŒå¸ç†±æƒ…æä¾›æ¢—åŠç¿»è¯',
689 'uploader': 'å«á„‹á„…',
690 'title': '[A-made] 變態å¦å—幕版 å¤ªå¦ æˆ‘å°±æ˜¯é€™æ¨£çš„äºº',
691 },
692 },
693 # url_encoded_fmt_stream_map is empty string
694 {
695 'url': 'qEJwOuvDf7I',
696 'info_dict': {
697 'id': 'qEJwOuvDf7I',
698 'ext': 'webm',
699 'title': 'ОбÑуждение Ñудебной практики по выборам 14 ÑентÑÐ±Ñ€Ñ 2014 года в Санкт-Петербурге',
700 'description': '',
701 'upload_date': '20150404',
702 'uploader_id': 'spbelect',
703 'uploader': 'Ðаблюдатели Петербурга',
704 },
705 'params': {
706 'skip_download': 'requires avconv',
707 },
708 'skip': 'This live event has ended.',
709 },
710 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
711 {
712 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
713 'info_dict': {
714 'id': 'FIl7x6_3R5Y',
715 'ext': 'webm',
716 'title': 'md5:7b81415841e02ecd4313668cde88737a',
717 'description': 'md5:116377fd2963b81ec4ce64b542173306',
718 'duration': 220,
719 'upload_date': '20150625',
720 'uploader_id': 'dorappi2000',
721 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
722 'uploader': 'dorappi2000',
723 'formats': 'mincount:31',
724 },
725 'skip': 'not actual anymore',
726 },
727 # DASH manifest with segment_list
728 {
729 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
730 'md5': '8ce563a1d667b599d21064e982ab9e31',
731 'info_dict': {
732 'id': 'CsmdDsKjzN8',
733 'ext': 'mp4',
734 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
735 'uploader': 'Airtek',
736 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
737 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
738 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
739 },
740 'params': {
741 'youtube_include_dash_manifest': True,
742 'format': '135', # bestvideo
743 },
744 'skip': 'This live event has ended.',
745 },
746 {
747 # Multifeed videos (multiple cameras), URL is for Main Camera
748 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
749 'info_dict': {
750 'id': 'jqWvoWXjCVs',
751 'title': 'teamPGP: Rocket League Noob Stream',
752 'description': 'md5:dc7872fb300e143831327f1bae3af010',
753 },
754 'playlist': [{
755 'info_dict': {
756 'id': 'jqWvoWXjCVs',
757 'ext': 'mp4',
758 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
759 'description': 'md5:dc7872fb300e143831327f1bae3af010',
760 'duration': 7335,
761 'upload_date': '20150721',
762 'uploader': 'Beer Games Beer',
763 'uploader_id': 'beergamesbeer',
764 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
765 'license': 'Standard YouTube License',
766 },
767 }, {
768 'info_dict': {
769 'id': '6h8e8xoXJzg',
770 'ext': 'mp4',
771 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
772 'description': 'md5:dc7872fb300e143831327f1bae3af010',
773 'duration': 7337,
774 'upload_date': '20150721',
775 'uploader': 'Beer Games Beer',
776 'uploader_id': 'beergamesbeer',
777 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
778 'license': 'Standard YouTube License',
779 },
780 }, {
781 'info_dict': {
782 'id': 'PUOgX5z9xZw',
783 'ext': 'mp4',
784 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
785 'description': 'md5:dc7872fb300e143831327f1bae3af010',
786 'duration': 7337,
787 'upload_date': '20150721',
788 'uploader': 'Beer Games Beer',
789 'uploader_id': 'beergamesbeer',
790 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
791 'license': 'Standard YouTube License',
792 },
793 }, {
794 'info_dict': {
795 'id': 'teuwxikvS5k',
796 'ext': 'mp4',
797 'title': 'teamPGP: Rocket League Noob Stream (zim)',
798 'description': 'md5:dc7872fb300e143831327f1bae3af010',
799 'duration': 7334,
800 'upload_date': '20150721',
801 'uploader': 'Beer Games Beer',
802 'uploader_id': 'beergamesbeer',
803 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
804 'license': 'Standard YouTube License',
805 },
806 }],
807 'params': {
808 'skip_download': True,
809 },
810 'skip': 'This video is not available.',
811 },
812 {
813 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
814 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
815 'info_dict': {
816 'id': 'gVfLd0zydlo',
817 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
818 },
819 'playlist_count': 2,
820 'skip': 'Not multifeed anymore',
821 },
822 {
823 'url': 'https://vid.plus/FlRa-iH7PGw',
824 'only_matching': True,
825 },
826 {
827 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
828 'only_matching': True,
829 },
830 {
831 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
832 # Also tests cut-off URL expansion in video description (see
833 # https://github.com/ytdl-org/youtube-dl/issues/1892,
834 # https://github.com/ytdl-org/youtube-dl/issues/8164)
835 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
836 'info_dict': {
837 'id': 'lsguqyKfVQg',
838 'ext': 'mp4',
839 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
840 'alt_title': 'Dark Walk - Position Music',
841 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
842 'duration': 133,
843 'upload_date': '20151119',
844 'uploader_id': 'IronSoulElf',
845 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
846 'uploader': 'IronSoulElf',
847 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
848 'track': 'Dark Walk - Position Music',
849 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
850 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
851 },
852 'params': {
853 'skip_download': True,
854 },
855 },
856 {
857 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
858 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
859 'only_matching': True,
860 },
861 {
862 # Video with yt:stretch=17:0
863 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
864 'info_dict': {
865 'id': 'Q39EVAstoRM',
866 'ext': 'mp4',
867 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
868 'description': 'md5:ee18a25c350637c8faff806845bddee9',
869 'upload_date': '20151107',
870 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
871 'uploader': 'CH GAMER DROID',
872 },
873 'params': {
874 'skip_download': True,
875 },
876 'skip': 'This video does not exist.',
877 },
878 {
879 # Video licensed under Creative Commons
880 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
881 'info_dict': {
882 'id': 'M4gD1WSo5mA',
883 'ext': 'mp4',
884 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
885 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
886 'duration': 721,
887 'upload_date': '20150127',
888 'uploader_id': 'BerkmanCenter',
889 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
890 'uploader': 'The Berkman Klein Center for Internet & Society',
891 'license': 'Creative Commons Attribution license (reuse allowed)',
892 },
893 'params': {
894 'skip_download': True,
895 },
896 },
897 {
898 # Channel-like uploader_url
899 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
900 'info_dict': {
901 'id': 'eQcmzGIKrzg',
902 'ext': 'mp4',
903 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
904 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
905 'duration': 4060,
906 'upload_date': '20151119',
907 'uploader': 'Bernie Sanders',
908 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
909 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
910 'license': 'Creative Commons Attribution license (reuse allowed)',
911 },
912 'params': {
913 'skip_download': True,
914 },
915 },
916 {
917 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY',
918 'only_matching': True,
919 },
920 {
921 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
922 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
923 'only_matching': True,
924 },
925 {
926 # Rental video preview
927 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
928 'info_dict': {
929 'id': 'uGpuVWrhIzE',
930 'ext': 'mp4',
931 'title': 'Piku - Trailer',
932 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
933 'upload_date': '20150811',
934 'uploader': 'FlixMatrix',
935 'uploader_id': 'FlixMatrixKaravan',
936 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
937 'license': 'Standard YouTube License',
938 },
939 'params': {
940 'skip_download': True,
941 },
942 'skip': 'This video is not available.',
943 },
944 {
945 # YouTube Red video with episode data
946 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
947 'info_dict': {
948 'id': 'iqKdEhx-dD4',
949 'ext': 'mp4',
950 'title': 'Isolation - Mind Field (Ep 1)',
951 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
952 'duration': 2085,
953 'upload_date': '20170118',
954 'uploader': 'Vsauce',
955 'uploader_id': 'Vsauce',
956 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
957 'series': 'Mind Field',
958 'season_number': 1,
959 'episode_number': 1,
960 },
961 'params': {
962 'skip_download': True,
963 },
964 'expected_warnings': [
965 'Skipping DASH manifest',
966 ],
967 },
968 {
969 # The following content has been identified by the YouTube community
970 # as inappropriate or offensive to some audiences.
971 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
972 'info_dict': {
973 'id': '6SJNVb0GnPI',
974 'ext': 'mp4',
975 'title': 'Race Differences in Intelligence',
976 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
977 'duration': 965,
978 'upload_date': '20140124',
979 'uploader': 'New Century Foundation',
980 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
981 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
982 },
983 'params': {
984 'skip_download': True,
985 },
986 },
987 {
988 # itag 212
989 'url': '1t24XAntNCY',
990 'only_matching': True,
991 },
992 {
993 # geo restricted to JP
994 'url': 'sJL6WA-aGkQ',
995 'only_matching': True,
996 },
997 {
998 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
999 'only_matching': True,
1000 },
1001 {
1002 # DRM protected
1003 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1004 'only_matching': True,
1005 },
1006 {
1007 # Video with unsupported adaptive stream type formats
1008 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1009 'info_dict': {
1010 'id': 'Z4Vy8R84T1U',
1011 'ext': 'mp4',
1012 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1013 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1014 'duration': 433,
1015 'upload_date': '20130923',
1016 'uploader': 'Amelia Putri Harwita',
1017 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1018 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1019 'formats': 'maxcount:10',
1020 },
1021 'params': {
1022 'skip_download': True,
1023 'youtube_include_dash_manifest': False,
1024 },
1025 'skip': 'not actual anymore',
1026 },
1027 {
1028 # Youtube Music Auto-generated description
1029 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1030 'info_dict': {
1031 'id': 'MgNrAu2pzNs',
1032 'ext': 'mp4',
1033 'title': 'Voyeur Girl',
1034 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1035 'upload_date': '20190312',
1036 'uploader': 'Stephen - Topic',
1037 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1038 'artist': 'Stephen',
1039 'track': 'Voyeur Girl',
1040 'album': 'it\'s too much love to know my dear',
1041 'release_date': '20190313',
1042 'release_year': 2019,
1043 },
1044 'params': {
1045 'skip_download': True,
1046 },
1047 },
1048 {
1049 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1050 'only_matching': True,
1051 },
1052 {
1053 # invalid -> valid video id redirection
1054 'url': 'DJztXj2GPfl',
1055 'info_dict': {
1056 'id': 'DJztXj2GPfk',
1057 'ext': 'mp4',
1058 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1059 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1060 'upload_date': '20090125',
1061 'uploader': 'Prochorowka',
1062 'uploader_id': 'Prochorowka',
1063 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1064 'artist': 'Panjabi MC',
1065 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1066 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1067 },
1068 'params': {
1069 'skip_download': True,
1070 },
1071 },
1072 {
1073 # empty description results in an empty string
1074 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1075 'info_dict': {
1076 'id': 'x41yOUIvK2k',
1077 'ext': 'mp4',
1078 'title': 'IMG 3456',
1079 'description': '',
1080 'upload_date': '20170613',
1081 'uploader_id': 'ElevageOrVert',
1082 'uploader': 'ElevageOrVert',
1083 },
1084 'params': {
1085 'skip_download': True,
1086 },
1087 },
1088 {
1089 # with '};' inside yt initial data (see [1])
1090 # see [2] for an example with '};' inside ytInitialPlayerResponse
1091 # 1. https://github.com/ytdl-org/youtube-dl/issues/27093
1092 # 2. https://github.com/ytdl-org/youtube-dl/issues/27216
1093 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
1094 'info_dict': {
1095 'id': 'CHqg6qOn4no',
1096 'ext': 'mp4',
1097 'title': 'Part 77 Sort a list of simple types in c#',
1098 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
1099 'upload_date': '20130831',
1100 'uploader_id': 'kudvenkat',
1101 'uploader': 'kudvenkat',
1102 },
1103 'params': {
1104 'skip_download': True,
1105 },
1106 },
1107 {
1108 # another example of '};' in ytInitialData
1109 'url': 'https://www.youtube.com/watch?v=gVfgbahppCY',
1110 'only_matching': True,
1111 },
1112 {
1113 'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ',
1114 'only_matching': True,
1115 },
1116 ]
1117
    def __init__(self, *args, **kwargs):
        super(YoutubeIE, self).__init__(*args, **kwargs)
        # Per-instance cache mapping (player_url, signature cache id) to the
        # signature-decryption function extracted from that player
        # (populated and read by _decrypt_signature).
        self._player_cache = {}
1121
1122 def report_video_info_webpage_download(self, video_id):
1123 """Report attempt to download video info webpage."""
1124 self.to_screen('%s: Downloading video info webpage' % video_id)
1125
1126 def report_information_extraction(self, video_id):
1127 """Report attempt to extract video information."""
1128 self.to_screen('%s: Extracting video information' % video_id)
1129
1130 def report_unavailable_format(self, video_id, format):
1131 """Report extracted video URL."""
1132 self.to_screen('%s: Format %s not available' % (video_id, format))
1133
    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen('RTMP download detected')
1137
1138 def _signature_cache_id(self, example_sig):
1139 """ Return a string representation of a signature """
1140 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1141
1142 @classmethod
1143 def _extract_player_info(cls, player_url):
1144 for player_re in cls._PLAYER_INFO_RE:
1145 id_m = re.search(player_re, player_url)
1146 if id_m:
1147 break
1148 else:
1149 raise ExtractorError('Cannot identify player %r' % player_url)
1150 return id_m.group('ext'), id_m.group('id')
1151
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Download the player and build a signature-decryption function.

        The learned permutation is cached on disk ('youtube-sigfuncs') keyed
        by player type/id and the signature layout of example_sig, so a
        player is only downloaded once per layout.
        """
        player_type, player_id = self._extract_player_info(player_url)

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # func_id doubles as a cache filename component; it must not contain
        # any path separators.
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # Cached spec is a list of source-character indices; replay it.
            return lambda s: ''.join(s[i] for i in cache_spec)

        download_note = (
            'Downloading player %s' % player_url
            if self._downloader.params.get('verbose') else
            'Downloading %s player %s' % (player_type, player_id)
        )
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        # Run the function on a probe string of distinct characters to learn
        # which source index ends up at each output position, then cache that
        # permutation.  NOTE(review): this assumes the function only
        # rearranges/drops characters (no substitution) -- confirm against
        # the player's sig algorithm.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = res(test_string)
        cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
1191
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the extracted signature function.

        Probes func with a string of distinct characters, recovers the index
        permutation it performs and renders it as a '+'-joined mix of slice
        expressions (for runs with stride +/-1) and single-index lookups.
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render s[start:end+step:step], omitting parts that match
                # Python's slice defaults.
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside a run: keep extending while the stride matches,
                    # otherwise emit the finished slice and fall through to
                    # re-examine the current pair on the next iteration.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Adjacent indices start a new run with stride +1 or -1.
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush the final element / unfinished run.
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1230
    def _parse_sig_js(self, jscode):
        """Locate the signature function in the JS player code and wrap it.

        Returns a callable mapping an encrypted signature string to its
        decrypted form, backed by JSInterpreter.
        """
        # Patterns are ordered from current player layouts down to obsolete
        # ones; the first match wins.
        funcname = self._search_regex(
            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             # Obsolete patterns
             r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
            jscode, 'Initial JS player signature function name', group='sig')

        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        # The interpreter's extracted function takes its argument list as a
        # Python list.
        return lambda s: initial_function([s])
1251
1252 def _parse_sig_swf(self, file_contents):
1253 swfi = SWFInterpreter(file_contents)
1254 TARGET_CLASSNAME = 'SignatureDecipher'
1255 searched_class = swfi.extract_class(TARGET_CLASSNAME)
1256 initial_function = swfi.extract_function(searched_class, 'decipher')
1257 return lambda s: initial_function([s])
1258
1259 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1260 """Turn the encrypted s field into a working signature"""
1261
1262 if player_url is None:
1263 raise ExtractorError('Cannot decrypt signature without player_url')
1264
1265 if player_url.startswith('//'):
1266 player_url = 'https:' + player_url
1267 elif not re.match(r'https?://', player_url):
1268 player_url = compat_urlparse.urljoin(
1269 'https://www.youtube.com', player_url)
1270 try:
1271 player_id = (player_url, self._signature_cache_id(s))
1272 if player_id not in self._player_cache:
1273 func = self._extract_signature_function(
1274 video_id, player_url, s
1275 )
1276 self._player_cache[player_id] = func
1277 func = self._player_cache[player_id]
1278 if self._downloader.params.get('youtube_print_sig_code'):
1279 self._print_sig_code(func, s)
1280 return func(s)
1281 except Exception as e:
1282 tb = traceback.format_exc()
1283 raise ExtractorError(
1284 'Signature extraction failed: ' + tb, cause=e)
1285
1286 def _get_subtitles(self, video_id, webpage):
1287 try:
1288 subs_doc = self._download_xml(
1289 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1290 video_id, note=False)
1291 except ExtractorError as err:
1292 self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
1293 return {}
1294
1295 sub_lang_list = {}
1296 for track in subs_doc.findall('track'):
1297 lang = track.attrib['lang_code']
1298 if lang in sub_lang_list:
1299 continue
1300 sub_formats = []
1301 for ext in self._SUBTITLE_FORMATS:
1302 params = compat_urllib_parse_urlencode({
1303 'lang': lang,
1304 'v': video_id,
1305 'fmt': ext,
1306 'name': track.attrib['name'].encode('utf-8'),
1307 })
1308 sub_formats.append({
1309 'url': 'https://www.youtube.com/api/timedtext?' + params,
1310 'ext': ext,
1311 })
1312 sub_lang_list[lang] = sub_formats
1313 if not sub_lang_list:
1314 self._downloader.report_warning('video doesn\'t have subtitles')
1315 return {}
1316 return sub_lang_list
1317
1318 def _get_ytplayer_config(self, video_id, webpage):
1319 patterns = (
1320 # User data may contain arbitrary character sequences that may affect
1321 # JSON extraction with regex, e.g. when '};' is contained the second
1322 # regex won't capture the whole JSON. Yet working around by trying more
1323 # concrete regex first keeping in mind proper quoted string handling
1324 # to be implemented in future that will replace this workaround (see
1325 # https://github.com/ytdl-org/youtube-dl/issues/7468,
1326 # https://github.com/ytdl-org/youtube-dl/pull/7599)
1327 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1328 r';ytplayer\.config\s*=\s*({.+?});',
1329 )
1330 config = self._search_regex(
1331 patterns, webpage, 'ytplayer.config', default=None)
1332 if config:
1333 return self._parse_json(
1334 uppercase_escape(config), video_id, fatal=False)
1335
    def _get_automatic_captions(self, video_id, player_response, player_config):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns a dict mapping caption language codes to format lists, or {}
        (after a warning) when no automatic captions can be found.  Tries, in
        order: the legacy ttsurl API, the post-22.06.2017 player_response
        caption tracks, and the legacy caption_tracks args.
        """
        self.to_screen('%s: Looking for automatic captions' % video_id)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if not (player_response or player_config):
            self._downloader.report_warning(err_msg)
            return {}
        try:
            args = player_config.get('args') if player_config else {}
            caption_url = args.get('ttsurl')
            if caption_url:
                # Legacy path: a ttsurl endpoint lists the original track and
                # the languages it can be auto-translated into.
                timestamp = args['timestamp']
                # We get the available subtitles
                list_params = compat_urllib_parse_urlencode({
                    'type': 'list',
                    'tlangs': 1,
                    'asrs': 1,
                })
                list_url = caption_url + '&' + list_params
                caption_list = self._download_xml(list_url, video_id)
                original_lang_node = caption_list.find('track')
                if original_lang_node is None:
                    self._downloader.report_warning('Video doesn\'t have automatic captions')
                    return {}
                original_lang = original_lang_node.attrib['lang_code']
                caption_kind = original_lang_node.attrib.get('kind', '')

                sub_lang_list = {}
                for lang_node in caption_list.findall('target'):
                    sub_lang = lang_node.attrib['lang_code']
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        params = compat_urllib_parse_urlencode({
                            'lang': original_lang,
                            'tlang': sub_lang,
                            'fmt': ext,
                            'ts': timestamp,
                            'kind': caption_kind,
                        })
                        sub_formats.append({
                            'url': caption_url + '&' + params,
                            'ext': ext,
                        })
                    sub_lang_list[sub_lang] = sub_formats
                return sub_lang_list

            def make_captions(sub_url, sub_langs):
                # Build one URL per (language, format) by rewriting the query
                # string of the base caption URL.
                parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
                caption_qs = compat_parse_qs(parsed_sub_url.query)
                captions = {}
                for sub_lang in sub_langs:
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        caption_qs.update({
                            'tlang': [sub_lang],
                            'fmt': [ext],
                        })
                        sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                            query=compat_urllib_parse_urlencode(caption_qs, True)))
                        sub_formats.append({
                            'url': sub_url,
                            'ext': ext,
                        })
                    captions[sub_lang] = sub_formats
                return captions

            # New captions format as of 22.06.2017
            if player_response:
                renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                base_url = renderer['captionTracks'][0]['baseUrl']
                sub_lang_list = []
                for lang in renderer['translationLanguages']:
                    lang_code = lang.get('languageCode')
                    if lang_code:
                        sub_lang_list.append(lang_code)
                return make_captions(base_url, sub_lang_list)

            # Some videos don't provide ttsurl but rather caption_tracks and
            # caption_translation_languages (e.g. 20LmZk1hakA)
            # Does not used anymore as of 22.06.2017
            caption_tracks = args['caption_tracks']
            caption_translation_languages = args['caption_translation_languages']
            caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
            sub_lang_list = []
            for lang in caption_translation_languages.split(','):
                lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
                sub_lang = lang_qs.get('lc', [None])[0]
                if sub_lang:
                    sub_lang_list.append(sub_lang)
            return make_captions(caption_url, sub_lang_list)
        # An extractor error can be raise by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, IndexError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1432
    def _mark_watched(self, video_id, video_info, player_response):
        """Ping YouTube's playback-stats URL so the video is marked as watched.

        The URL is taken from player_response when available, falling back to
        the legacy video_info field; a no-op when neither provides one.
        """
        playback_url = url_or_none(try_get(
            player_response,
            lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
            video_info, lambda x: x['videostats_playback_base_url'][0]))
        if not playback_url:
            return
        parsed_playback_url = compat_urlparse.urlparse(playback_url)
        qs = compat_urlparse.parse_qs(parsed_playback_url.query)

        # cpn generation algorithm is reverse engineered from base.js.
        # In fact it works even with dummy cpn.
        CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
        # NOTE(review): randint(0, 256) spans 257 values, so the '& 63' mask
        # keeps indexing in range but slightly biases towards index 0.
        # Harmless here since any dummy cpn is accepted (see above).
        cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))

        qs.update({
            'ver': ['2'],
            'cpn': [cpn],
        })
        playback_url = compat_urlparse.urlunparse(
            parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))

        # Best-effort ping; failures are not fatal to extraction.
        self._download_webpage(
            playback_url, video_id, 'Marking watched',
            'Unable to mark watched', fatal=False)
1458
1459 @staticmethod
1460 def _extract_urls(webpage):
1461 # Embedded YouTube player
1462 entries = [
1463 unescapeHTML(mobj.group('url'))
1464 for mobj in re.finditer(r'''(?x)
1465 (?:
1466 <iframe[^>]+?src=|
1467 data-video-url=|
1468 <embed[^>]+?src=|
1469 embedSWF\(?:\s*|
1470 <object[^>]+data=|
1471 new\s+SWFObject\(
1472 )
1473 (["\'])
1474 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1475 (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
1476 \1''', webpage)]
1477
1478 # lazyYT YouTube embed
1479 entries.extend(list(map(
1480 unescapeHTML,
1481 re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
1482
1483 # Wordpress "YouTube Video Importer" plugin
1484 matches = re.findall(r'''(?x)<div[^>]+
1485 class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
1486 data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
1487 entries.extend(m[-1] for m in matches)
1488
1489 return entries
1490
1491 @staticmethod
1492 def _extract_url(webpage):
1493 urls = YoutubeIE._extract_urls(webpage)
1494 return urls[0] if urls else None
1495
1496 @classmethod
1497 def extract_id(cls, url):
1498 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1499 if mobj is None:
1500 raise ExtractorError('Invalid URL: %s' % url)
1501 video_id = mobj.group(2)
1502 return video_id
1503
    def _extract_chapters_from_json(self, webpage, video_id, duration):
        """Extract chapter markers from the ytInitialData player bar.

        Returns a list of {'start_time', 'end_time', 'title'} dicts, or None
        when the webpage, initial data or chapter list is unavailable.
        """
        if not webpage:
            return
        data = self._extract_yt_initial_data(video_id, webpage)
        if not data or not isinstance(data, dict):
            return
        chapters_list = try_get(
            data,
            lambda x: x['playerOverlays']
                ['playerOverlayRenderer']
                ['decoratedPlayerBarRenderer']
                ['decoratedPlayerBarRenderer']
                ['playerBar']
                ['chapteredPlayerBarRenderer']
                ['chapters'],
            list)
        if not chapters_list:
            return

        def chapter_time(chapter):
            # Start time is given in milliseconds; convert to seconds.
            return float_or_none(
                try_get(
                    chapter,
                    lambda x: x['chapterRenderer']['timeRangeStartMillis'],
                    int),
                scale=1000)
        chapters = []
        for next_num, chapter in enumerate(chapters_list, start=1):
            start_time = chapter_time(chapter)
            if start_time is None:
                continue
            # A chapter ends where the next one begins; the last chapter ends
            # at the video duration.
            end_time = (chapter_time(chapters_list[next_num])
                        if next_num < len(chapters_list) else duration)
            if end_time is None:
                continue
            title = try_get(
                chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
                compat_str)
            chapters.append({
                'start_time': start_time,
                'end_time': end_time,
                'title': title,
            })
        return chapters
1548
    @staticmethod
    def _extract_chapters_from_description(description, duration):
        """Parse chapter markers from seekTo links in the video description.

        Returns a list of {'start_time', 'end_time', 'title'} dicts, or None
        when the description contains no recognizable chapter lines.
        """
        if not description:
            return None
        # A chapter line is a <br/>-delimited line containing an onclick
        # seekTo anchor whose text is a mm:ss or hh:mm:ss timestamp.
        chapter_lines = re.findall(
            r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
            description)
        if not chapter_lines:
            return None
        chapters = []
        for next_num, (chapter_line, time_point) in enumerate(
                chapter_lines, start=1):
            start_time = parse_duration(time_point)
            if start_time is None:
                continue
            if start_time > duration:
                break
            # A chapter ends where the next one begins; the last chapter
            # ends at the video duration.
            end_time = (duration if next_num == len(chapter_lines)
                        else parse_duration(chapter_lines[next_num][1]))
            if end_time is None:
                continue
            if end_time > duration:
                end_time = duration
            if start_time > end_time:
                break
            # Strip the seekTo anchor, surrounding dashes/whitespace, and
            # collapse runs of whitespace to obtain the chapter title.
            chapter_title = re.sub(
                r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
            chapter_title = re.sub(r'\s+', ' ', chapter_title)
            chapters.append({
                'start_time': start_time,
                'end_time': end_time,
                'title': chapter_title,
            })
        return chapters
1583
1584 def _extract_chapters(self, webpage, description, video_id, duration):
1585 return (self._extract_chapters_from_json(webpage, video_id, duration)
1586 or self._extract_chapters_from_description(description, duration))
1587
1588 def _real_extract(self, url):
1589 url, smuggled_data = unsmuggle_url(url, {})
1590
1591 proto = (
1592 'http' if self._downloader.params.get('prefer_insecure', False)
1593 else 'https')
1594
1595 start_time = None
1596 end_time = None
1597 parsed_url = compat_urllib_parse_urlparse(url)
1598 for component in [parsed_url.fragment, parsed_url.query]:
1599 query = compat_parse_qs(component)
1600 if start_time is None and 't' in query:
1601 start_time = parse_duration(query['t'][0])
1602 if start_time is None and 'start' in query:
1603 start_time = parse_duration(query['start'][0])
1604 if end_time is None and 'end' in query:
1605 end_time = parse_duration(query['end'][0])
1606
1607 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1608 mobj = re.search(self._NEXT_URL_RE, url)
1609 if mobj:
1610 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1611 video_id = self.extract_id(url)
1612
1613 # Check url is youtube music
1614 is_music = 1 #re.match(r'^https?:\/\/music\.youtube\.com\/.+', url) is not None
1615
1616 # Get video webpage
1617 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1618 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1619
1620 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1621 video_id = qs.get('v', [None])[0] or video_id
1622
1623 # Attempt to extract SWF player URL
1624 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1625 if mobj is not None:
1626 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1627 else:
1628 player_url = None
1629
1630 dash_mpds = []
1631
1632 def add_dash_mpd(video_info):
1633 dash_mpd = video_info.get('dashmpd')
1634 if dash_mpd and dash_mpd[0] not in dash_mpds:
1635 dash_mpds.append(dash_mpd[0])
1636
1637 def add_dash_mpd_pr(pl_response):
1638 dash_mpd = url_or_none(try_get(
1639 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1640 compat_str))
1641 if dash_mpd and dash_mpd not in dash_mpds:
1642 dash_mpds.append(dash_mpd)
1643
1644 is_live = None
1645 view_count = None
1646
1647 def extract_view_count(v_info):
1648 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1649
1650 def extract_player_response(player_response, video_id):
1651 pl_response = str_or_none(player_response)
1652 if not pl_response:
1653 return
1654 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1655 if isinstance(pl_response, dict):
1656 add_dash_mpd_pr(pl_response)
1657 return pl_response
1658
1659 player_response = {}
1660
1661 # Get video info
1662 video_info = {}
1663 embed_webpage = None
1664 ytplayer_config = None
1665
1666 # Youtube music should be parsed from get_video_info
1667 # instead of youtube for 256kbps aac codec
1668 if is_music or re.search(r'["\']status["\']\s*:\s*["\']LOGIN_REQUIRED', video_webpage) is not None:
1669 age_gate = True
1670 # We simulate the access to the video from www.youtube.com/v/{video_id}
1671 # this can be viewed without login into Youtube
1672 url = proto + '://www.youtube.com/embed/%s' % video_id
1673 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1674
1675 if is_music:
1676 # el, c, cver, cplayer field required for 141(aac 256kbps) codec
1677 # maybe paramter of youtube music player?
1678 data = compat_urllib_parse_urlencode({
1679 'video_id': video_id,
1680 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1681 'el': 'detailpage',
1682 'c': 'WEB_REMIX',
1683 'cver': '0.1',
1684 'cplayer': 'UNIPLAYER',
1685 'sts': self._search_regex(
1686 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1687 })
1688 else:
1689 # Remove youtube music parameter for normal video
1690 data = compat_urllib_parse_urlencode({
1691 'video_id': video_id,
1692 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1693 'sts': self._search_regex(
1694 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1695 })
1696 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1697 try:
1698 video_info_webpage = self._download_webpage(
1699 video_info_url, video_id,
1700 note='Refetching age-gated info webpage',
1701 errnote='unable to download video info webpage')
1702 except ExtractorError:
1703 video_info_webpage = None
1704 if video_info_webpage:
1705 video_info = compat_parse_qs(video_info_webpage)
1706 pl_response = video_info.get('player_response', [None])[0]
1707 player_response = extract_player_response(pl_response, video_id)
1708 add_dash_mpd(video_info)
1709 view_count = extract_view_count(video_info)
1710 else:
1711 age_gate = False
1712 # Try looking directly into the video webpage
1713 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1714 if ytplayer_config:
1715 args = ytplayer_config['args']
1716 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1717 # Convert to the same format returned by compat_parse_qs
1718 video_info = dict((k, [v]) for k, v in args.items())
1719 add_dash_mpd(video_info)
1720 # Rental video is not rented but preview is available (e.g.
1721 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1722 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1723 if not video_info and args.get('ypc_vid'):
1724 return self.url_result(
1725 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1726 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1727 is_live = True
1728 if not player_response:
1729 player_response = extract_player_response(args.get('player_response'), video_id)
1730 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1731 add_dash_mpd_pr(player_response)
1732
1733 if not video_info and not player_response:
1734 player_response = extract_player_response(
1735 self._search_regex(
1736 (r'%s\s*%s' % (self._YT_INITIAL_PLAYER_RESPONSE_RE, self._YT_INITIAL_BOUNDARY_RE),
1737 self._YT_INITIAL_PLAYER_RESPONSE_RE), video_webpage,
1738 'initial player response', default='{}'),
1739 video_id)
1740
1741 def extract_unavailable_message():
1742 messages = []
1743 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1744 msg = self._html_search_regex(
1745 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1746 video_webpage, 'unavailable %s' % kind, default=None)
1747 if msg:
1748 messages.append(msg)
1749 if messages:
1750 return '\n'.join(messages)
1751
1752 if not video_info and not player_response:
1753 unavailable_message = extract_unavailable_message()
1754 if not unavailable_message:
1755 unavailable_message = 'Unable to extract video data'
1756 raise ExtractorError(
1757 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1758
1759 if not isinstance(video_info, dict):
1760 video_info = {}
1761
1762 video_details = try_get(
1763 player_response, lambda x: x['videoDetails'], dict) or {}
1764
1765 microformat = try_get(
1766 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1767
1768 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1769 if not video_title:
1770 self._downloader.report_warning('Unable to extract video title')
1771 video_title = '_'
1772
1773 description_original = video_description = get_element_by_id("eow-description", video_webpage)
1774 if video_description:
1775
1776 def replace_url(m):
1777 redir_url = compat_urlparse.urljoin(url, m.group(1))
1778 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1779 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1780 qs = compat_parse_qs(parsed_redir_url.query)
1781 q = qs.get('q')
1782 if q and q[0]:
1783 return q[0]
1784 return redir_url
1785
1786 description_original = video_description = re.sub(r'''(?x)
1787 <a\s+
1788 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1789 (?:title|href)="([^"]+)"\s+
1790 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1791 class="[^"]*"[^>]*>
1792 [^<]+\.{3}\s*
1793 </a>
1794 ''', replace_url, video_description)
1795 video_description = clean_html(video_description)
1796 else:
1797 video_description = video_details.get('shortDescription')
1798 if video_description is None:
1799 video_description = self._html_search_meta('description', video_webpage)
1800
1801 if not smuggled_data.get('force_singlefeed', False):
1802 if not self._downloader.params.get('noplaylist'):
1803 multifeed_metadata_list = try_get(
1804 player_response,
1805 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
1806 compat_str) or try_get(
1807 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
1808 if multifeed_metadata_list:
1809 entries = []
1810 feed_ids = []
1811 for feed in multifeed_metadata_list.split(','):
1812 # Unquote should take place before split on comma (,) since textual
1813 # fields may contain comma as well (see
1814 # https://github.com/ytdl-org/youtube-dl/issues/8536)
1815 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
1816
1817 def feed_entry(name):
1818 return try_get(feed_data, lambda x: x[name][0], compat_str)
1819
1820 feed_id = feed_entry('id')
1821 if not feed_id:
1822 continue
1823 feed_title = feed_entry('title')
1824 title = video_title
1825 if feed_title:
1826 title += ' (%s)' % feed_title
1827 entries.append({
1828 '_type': 'url_transparent',
1829 'ie_key': 'Youtube',
1830 'url': smuggle_url(
1831 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1832 {'force_singlefeed': True}),
1833 'title': title,
1834 })
1835 feed_ids.append(feed_id)
1836 self.to_screen(
1837 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1838 % (', '.join(feed_ids), video_id))
1839 return self.playlist_result(entries, video_id, video_title, video_description)
1840 else:
1841 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1842
1843 if view_count is None:
1844 view_count = extract_view_count(video_info)
1845 if view_count is None and video_details:
1846 view_count = int_or_none(video_details.get('viewCount'))
1847 if view_count is None and microformat:
1848 view_count = int_or_none(microformat.get('viewCount'))
1849
1850 if is_live is None:
1851 is_live = bool_or_none(video_details.get('isLive'))
1852
1853 # Check for "rental" videos
1854 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1855 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
1856
1857 def _extract_filesize(media_url):
1858 return int_or_none(self._search_regex(
1859 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
1860
1861 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
1862 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
1863
1864 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1865 self.report_rtmp_download()
1866 formats = [{
1867 'format_id': '_rtmp',
1868 'protocol': 'rtmp',
1869 'url': video_info['conn'][0],
1870 'player_url': player_url,
1871 }]
1872 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
1873 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
1874 if 'rtmpe%3Dyes' in encoded_url_map:
1875 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
1876 formats = []
1877 formats_spec = {}
1878 fmt_list = video_info.get('fmt_list', [''])[0]
1879 if fmt_list:
1880 for fmt in fmt_list.split(','):
1881 spec = fmt.split('/')
1882 if len(spec) > 1:
1883 width_height = spec[1].split('x')
1884 if len(width_height) == 2:
1885 formats_spec[spec[0]] = {
1886 'resolution': spec[1],
1887 'width': int_or_none(width_height[0]),
1888 'height': int_or_none(width_height[1]),
1889 }
1890 for fmt in streaming_formats:
1891 itag = str_or_none(fmt.get('itag'))
1892 if not itag:
1893 continue
1894 quality = fmt.get('quality')
1895 quality_label = fmt.get('qualityLabel') or quality
1896 formats_spec[itag] = {
1897 'asr': int_or_none(fmt.get('audioSampleRate')),
1898 'filesize': int_or_none(fmt.get('contentLength')),
1899 'format_note': quality_label,
1900 'fps': int_or_none(fmt.get('fps')),
1901 'height': int_or_none(fmt.get('height')),
1902 # bitrate for itag 43 is always 2147483647
1903 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
1904 'width': int_or_none(fmt.get('width')),
1905 }
1906
1907 for fmt in streaming_formats:
1908 if fmt.get('drmFamilies') or fmt.get('drm_families'):
1909 continue
1910 url = url_or_none(fmt.get('url'))
1911
1912 if not url:
1913 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
1914 if not cipher:
1915 continue
1916 url_data = compat_parse_qs(cipher)
1917 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
1918 if not url:
1919 continue
1920 else:
1921 cipher = None
1922 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
1923
1924 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
1925 # Unsupported FORMAT_STREAM_TYPE_OTF
1926 if stream_type == 3:
1927 continue
1928
1929 format_id = fmt.get('itag') or url_data['itag'][0]
1930 if not format_id:
1931 continue
1932 format_id = compat_str(format_id)
1933
1934 if cipher:
1935 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
1936 ASSETS_RE = (
1937 r'<script[^>]+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base',
1938 r'"jsUrl"\s*:\s*("[^"]+")',
1939 r'"assets":.+?"js":\s*("[^"]+")')
1940 jsplayer_url_json = self._search_regex(
1941 ASSETS_RE,
1942 embed_webpage if age_gate else video_webpage,
1943 'JS player URL (1)', default=None)
1944 if not jsplayer_url_json and not age_gate:
1945 # We need the embed website after all
1946 if embed_webpage is None:
1947 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
1948 embed_webpage = self._download_webpage(
1949 embed_url, video_id, 'Downloading embed webpage')
1950 jsplayer_url_json = self._search_regex(
1951 ASSETS_RE, embed_webpage, 'JS player URL')
1952
1953 player_url = json.loads(jsplayer_url_json)
1954 if player_url is None:
1955 player_url_json = self._search_regex(
1956 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
1957 video_webpage, 'age gate player URL')
1958 player_url = json.loads(player_url_json)
1959
1960 if 'sig' in url_data:
1961 url += '&signature=' + url_data['sig'][0]
1962 elif 's' in url_data:
1963 encrypted_sig = url_data['s'][0]
1964
1965 if self._downloader.params.get('verbose'):
1966 if player_url is None:
1967 player_desc = 'unknown'
1968 else:
1969 player_type, player_version = self._extract_player_info(player_url)
1970 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
1971 parts_sizes = self._signature_cache_id(encrypted_sig)
1972 self.to_screen('{%s} signature length %s, %s' %
1973 (format_id, parts_sizes, player_desc))
1974
1975 signature = self._decrypt_signature(
1976 encrypted_sig, video_id, player_url, age_gate)
1977 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
1978 url += '&%s=%s' % (sp, signature)
1979 if 'ratebypass' not in url:
1980 url += '&ratebypass=yes'
1981
1982 dct = {
1983 'format_id': format_id,
1984 'url': url,
1985 'player_url': player_url,
1986 }
1987 if format_id in self._formats:
1988 dct.update(self._formats[format_id])
1989 if format_id in formats_spec:
1990 dct.update(formats_spec[format_id])
1991
1992 # Some itags are not included in DASH manifest thus corresponding formats will
1993 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
1994 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
1995 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
1996 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
1997
1998 if width is None:
1999 width = int_or_none(fmt.get('width'))
2000 if height is None:
2001 height = int_or_none(fmt.get('height'))
2002
2003 filesize = int_or_none(url_data.get(
2004 'clen', [None])[0]) or _extract_filesize(url)
2005
2006 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2007 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2008
2009 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2010 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
2011 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
2012
2013 more_fields = {
2014 'filesize': filesize,
2015 'tbr': tbr,
2016 'width': width,
2017 'height': height,
2018 'fps': fps,
2019 'format_note': quality_label or quality,
2020 }
2021 for key, value in more_fields.items():
2022 if value:
2023 dct[key] = value
2024 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
2025 if type_:
2026 type_split = type_.split(';')
2027 kind_ext = type_split[0].split('/')
2028 if len(kind_ext) == 2:
2029 kind, _ = kind_ext
2030 dct['ext'] = mimetype2ext(type_split[0])
2031 if kind in ('audio', 'video'):
2032 codecs = None
2033 for mobj in re.finditer(
2034 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2035 if mobj.group('key') == 'codecs':
2036 codecs = mobj.group('val')
2037 break
2038 if codecs:
2039 dct.update(parse_codecs(codecs))
2040 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2041 dct['downloader_options'] = {
2042 # Youtube throttles chunks >~10M
2043 'http_chunk_size': 10485760,
2044 }
2045 formats.append(dct)
2046 else:
2047 manifest_url = (
2048 url_or_none(try_get(
2049 player_response,
2050 lambda x: x['streamingData']['hlsManifestUrl'],
2051 compat_str))
2052 or url_or_none(try_get(
2053 video_info, lambda x: x['hlsvp'][0], compat_str)))
2054 if manifest_url:
2055 formats = []
2056 m3u8_formats = self._extract_m3u8_formats(
2057 manifest_url, video_id, 'mp4', fatal=False)
2058 for a_format in m3u8_formats:
2059 itag = self._search_regex(
2060 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2061 if itag:
2062 a_format['format_id'] = itag
2063 if itag in self._formats:
2064 dct = self._formats[itag].copy()
2065 dct.update(a_format)
2066 a_format = dct
2067 a_format['player_url'] = player_url
2068 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2069 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
2070 formats.append(a_format)
2071 else:
2072 error_message = extract_unavailable_message()
2073 if not error_message:
2074 reason_list = try_get(
2075 player_response,
2076 lambda x: x['playabilityStatus']['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'],
2077 list) or []
2078 for reason in reason_list:
2079 if not isinstance(reason, dict):
2080 continue
2081 reason_text = try_get(reason, lambda x: x['text'], compat_str)
2082 if reason_text:
2083 if not error_message:
2084 error_message = ''
2085 error_message += reason_text
2086 if error_message:
2087 error_message = clean_html(error_message)
2088 if not error_message:
2089 error_message = clean_html(try_get(
2090 player_response, lambda x: x['playabilityStatus']['reason'],
2091 compat_str))
2092 if not error_message:
2093 error_message = clean_html(
2094 try_get(video_info, lambda x: x['reason'][0], compat_str))
2095 if error_message:
2096 raise ExtractorError(error_message, expected=True)
2097 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
2098
2099 # uploader
2100 video_uploader = try_get(
2101 video_info, lambda x: x['author'][0],
2102 compat_str) or str_or_none(video_details.get('author'))
2103 if video_uploader:
2104 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2105 else:
2106 self._downloader.report_warning('unable to extract uploader name')
2107
2108 # uploader_id
2109 video_uploader_id = None
2110 video_uploader_url = None
2111 mobj = re.search(
2112 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2113 video_webpage)
2114 if mobj is not None:
2115 video_uploader_id = mobj.group('uploader_id')
2116 video_uploader_url = mobj.group('uploader_url')
2117 else:
2118 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2119 if owner_profile_url:
2120 video_uploader_id = self._search_regex(
2121 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2122 default=None)
2123 video_uploader_url = owner_profile_url
2124
2125 channel_id = (
2126 str_or_none(video_details.get('channelId'))
2127 or self._html_search_meta(
2128 'channelId', video_webpage, 'channel id', default=None)
2129 or self._search_regex(
2130 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2131 video_webpage, 'channel id', default=None, group='id'))
2132 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2133
2134 thumbnails = []
2135 thumbnails_list = try_get(
2136 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2137 for t in thumbnails_list:
2138 if not isinstance(t, dict):
2139 continue
2140 thumbnail_url = url_or_none(t.get('url'))
2141 if not thumbnail_url:
2142 continue
2143 thumbnails.append({
2144 'url': thumbnail_url,
2145 'width': int_or_none(t.get('width')),
2146 'height': int_or_none(t.get('height')),
2147 })
2148
2149 if not thumbnails:
2150 video_thumbnail = None
2151 # We try first to get a high quality image:
2152 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2153 video_webpage, re.DOTALL)
2154 if m_thumb is not None:
2155 video_thumbnail = m_thumb.group(1)
2156 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2157 if thumbnail_url:
2158 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2159 if video_thumbnail:
2160 thumbnails.append({'url': video_thumbnail})
2161
2162 # upload date
2163 upload_date = self._html_search_meta(
2164 'datePublished', video_webpage, 'upload date', default=None)
2165 if not upload_date:
2166 upload_date = self._search_regex(
2167 [r'(?s)id="eow-date.*?>(.*?)</span>',
2168 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2169 video_webpage, 'upload date', default=None)
2170 if not upload_date:
2171 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
2172 upload_date = unified_strdate(upload_date)
2173
2174 video_license = self._html_search_regex(
2175 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2176 video_webpage, 'license', default=None)
2177
2178 m_music = re.search(
2179 r'''(?x)
2180 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2181 <ul[^>]*>\s*
2182 <li>(?P<title>.+?)
2183 by (?P<creator>.+?)
2184 (?:
2185 \(.+?\)|
2186 <a[^>]*
2187 (?:
2188 \bhref=["\']/red[^>]*>| # drop possible
2189 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2190 )
2191 .*?
2192 )?</li
2193 ''',
2194 video_webpage)
2195 if m_music:
2196 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2197 video_creator = clean_html(m_music.group('creator'))
2198 else:
2199 video_alt_title = video_creator = None
2200
2201 def extract_meta(field):
2202 return self._html_search_regex(
2203 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2204 video_webpage, field, default=None)
2205
2206 track = extract_meta('Song')
2207 artist = extract_meta('Artist')
2208 album = extract_meta('Album')
2209
2210 # Youtube Music Auto-generated description
2211 release_date = release_year = None
2212 if video_description:
2213 mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
2214 if mobj:
2215 if not track:
2216 track = mobj.group('track').strip()
2217 if not artist:
2218 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
2219 if not album:
2220 album = mobj.group('album'.strip())
2221 release_year = mobj.group('release_year')
2222 release_date = mobj.group('release_date')
2223 if release_date:
2224 release_date = release_date.replace('-', '')
2225 if not release_year:
2226 release_year = int(release_date[:4])
2227 if release_year:
2228 release_year = int(release_year)
2229
2230 yt_initial_data = self._extract_yt_initial_data(video_id, video_webpage)
2231 contents = try_get(yt_initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or []
2232 for content in contents:
2233 rows = try_get(content, lambda x: x['videoSecondaryInfoRenderer']['metadataRowContainer']['metadataRowContainerRenderer']['rows'], list) or []
2234 multiple_songs = False
2235 for row in rows:
2236 if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
2237 multiple_songs = True
2238 break
2239 for row in rows:
2240 mrr = row.get('metadataRowRenderer') or {}
2241 mrr_title = try_get(
2242 mrr, lambda x: x['title']['simpleText'], compat_str)
2243 mrr_contents = try_get(
2244 mrr, lambda x: x['contents'][0], dict) or {}
2245 mrr_contents_text = try_get(mrr_contents, [lambda x: x['simpleText'], lambda x: x['runs'][0]['text']], compat_str)
2246 if not (mrr_title and mrr_contents_text):
2247 continue
2248 if mrr_title == 'License':
2249 video_license = mrr_contents_text
2250 elif not multiple_songs:
2251 if mrr_title == 'Album':
2252 album = mrr_contents_text
2253 elif mrr_title == 'Artist':
2254 artist = mrr_contents_text
2255 elif mrr_title == 'Song':
2256 track = mrr_contents_text
2257
2258 m_episode = re.search(
2259 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2260 video_webpage)
2261 if m_episode:
2262 series = unescapeHTML(m_episode.group('series'))
2263 season_number = int(m_episode.group('season'))
2264 episode_number = int(m_episode.group('episode'))
2265 else:
2266 series = season_number = episode_number = None
2267
2268 m_cat_container = self._search_regex(
2269 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2270 video_webpage, 'categories', default=None)
2271 category = None
2272 if m_cat_container:
2273 category = self._html_search_regex(
2274 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2275 default=None)
2276 if not category:
2277 category = try_get(
2278 microformat, lambda x: x['category'], compat_str)
2279 video_categories = None if category is None else [category]
2280
2281 video_tags = [
2282 unescapeHTML(m.group('content'))
2283 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
2284 if not video_tags:
2285 video_tags = try_get(video_details, lambda x: x['keywords'], list)
2286
2287 def _extract_count(count_name):
2288 return str_to_int(self._search_regex(
2289 (r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' % re.escape(count_name),
2290 r'["\']label["\']\s*:\s*["\']([\d,.]+)\s+%ss["\']' % re.escape(count_name)),
2291 video_webpage, count_name, default=None))
2292
2293 like_count = _extract_count('like')
2294 dislike_count = _extract_count('dislike')
2295
2296 if view_count is None:
2297 view_count = str_to_int(self._search_regex(
2298 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2299 'view count', default=None))
2300
2301 average_rating = (
2302 float_or_none(video_details.get('averageRating'))
2303 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2304
2305 # subtitles
2306 video_subtitles = self.extract_subtitles(video_id, video_webpage)
2307 automatic_captions = self.extract_automatic_captions(video_id, player_response, ytplayer_config)
2308
2309 video_duration = try_get(
2310 video_info, lambda x: int_or_none(x['length_seconds'][0]))
2311 if not video_duration:
2312 video_duration = int_or_none(video_details.get('lengthSeconds'))
2313 if not video_duration:
2314 video_duration = parse_duration(self._html_search_meta(
2315 'duration', video_webpage, 'video duration'))
2316
2317 # annotations
2318 video_annotations = None
2319 if self._downloader.params.get('writeannotations', False):
2320 xsrf_token = None
2321 ytcfg = self._extract_ytcfg(video_id, video_webpage)
2322 if ytcfg:
2323 xsrf_token = try_get(ytcfg, lambda x: x['XSRF_TOKEN'], compat_str)
2324 if not xsrf_token:
2325 xsrf_token = self._search_regex(
2326 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2',
2327 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2328 invideo_url = try_get(
2329 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2330 if xsrf_token and invideo_url:
2331 xsrf_field_name = None
2332 if ytcfg:
2333 xsrf_field_name = try_get(ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str)
2334 if not xsrf_field_name:
2335 xsrf_field_name = self._search_regex(
2336 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2337 video_webpage, 'xsrf field name',
2338 group='xsrf_field_name', default='session_token')
2339 video_annotations = self._download_webpage(
2340 self._proto_relative_url(invideo_url),
2341 video_id, note='Downloading annotations',
2342 errnote='Unable to download video annotations', fatal=False,
2343 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
2344
2345 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
2346
2347 # Look for the DASH manifest
2348 if self._downloader.params.get('youtube_include_dash_manifest', True):
2349 dash_mpd_fatal = True
2350 for mpd_url in dash_mpds:
2351 dash_formats = {}
2352 try:
2353 def decrypt_sig(mobj):
2354 s = mobj.group(1)
2355 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2356 return '/signature/%s' % dec_s
2357
2358 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2359
2360 for df in self._extract_mpd_formats(
2361 mpd_url, video_id, fatal=dash_mpd_fatal,
2362 formats_dict=self._formats):
2363 if not df.get('filesize'):
2364 df['filesize'] = _extract_filesize(df['url'])
2365 # Do not overwrite DASH format found in some previous DASH manifest
2366 if df['format_id'] not in dash_formats:
2367 dash_formats[df['format_id']] = df
2368 # Additional DASH manifests may end up in HTTP Error 403 therefore
2369 # allow them to fail without bug report message if we already have
2370 # some DASH manifest succeeded. This is temporary workaround to reduce
2371 # burst of bug reports until we figure out the reason and whether it
2372 # can be fixed at all.
2373 dash_mpd_fatal = False
2374 except (ExtractorError, KeyError) as e:
2375 self.report_warning(
2376 'Skipping DASH manifest: %r' % e, video_id)
2377 if dash_formats:
2378 # Remove the formats we found through non-DASH, they
2379 # contain less info and it can be wrong, because we use
2380 # fixed values (for example the resolution). See
2381 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
2382 # example.
2383 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
2384 formats.extend(dash_formats.values())
2385
2386 # Check for malformed aspect ratio
2387 stretched_m = re.search(
2388 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2389 video_webpage)
2390 if stretched_m:
2391 w = float(stretched_m.group('w'))
2392 h = float(stretched_m.group('h'))
2393 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2394 # We will only process correct ratios.
2395 if w > 0 and h > 0:
2396 ratio = w / h
2397 for f in formats:
2398 if f.get('vcodec') != 'none':
2399 f['stretched_ratio'] = ratio
2400
2401 if not formats:
2402 if 'reason' in video_info:
2403 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2404 regions_allowed = self._html_search_meta(
2405 'regionsAllowed', video_webpage, default=None)
2406 countries = regions_allowed.split(',') if regions_allowed else None
2407 self.raise_geo_restricted(
2408 msg=video_info['reason'][0], countries=countries)
2409 reason = video_info['reason'][0]
2410 if 'Invalid parameters' in reason:
2411 unavailable_message = extract_unavailable_message()
2412 if unavailable_message:
2413 reason = unavailable_message
2414 raise ExtractorError(
2415 'YouTube said: %s' % reason,
2416 expected=True, video_id=video_id)
2417 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2418 raise ExtractorError('This video is DRM protected.', expected=True)
2419
2420 self._sort_formats(formats)
2421
2422 self.mark_watched(video_id, video_info, player_response)
2423
2424 return {
2425 'id': video_id,
2426 'uploader': video_uploader,
2427 'uploader_id': video_uploader_id,
2428 'uploader_url': video_uploader_url,
2429 'channel_id': channel_id,
2430 'channel_url': channel_url,
2431 'upload_date': upload_date,
2432 'license': video_license,
2433 'creator': video_creator or artist,
2434 'title': video_title,
2435 'alt_title': video_alt_title or track,
2436 'thumbnails': thumbnails,
2437 'description': video_description,
2438 'categories': video_categories,
2439 'tags': video_tags,
2440 'subtitles': video_subtitles,
2441 'automatic_captions': automatic_captions,
2442 'duration': video_duration,
2443 'age_limit': 18 if age_gate else 0,
2444 'annotations': video_annotations,
2445 'chapters': chapters,
2446 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
2447 'view_count': view_count,
2448 'like_count': like_count,
2449 'dislike_count': dislike_count,
2450 'average_rating': average_rating,
2451 'formats': formats,
2452 'is_live': is_live,
2453 'start_time': start_time,
2454 'end_time': end_time,
2455 'series': series,
2456 'season_number': season_number,
2457 'episode_number': episode_number,
2458 'track': track,
2459 'artist': artist,
2460 'album': album,
2461 'release_date': release_date,
2462 'release_year': release_year,
2463 }
2464
2465
2466class YoutubeTabIE(YoutubeBaseInfoExtractor):
2467 IE_DESC = 'YouTube.com tab'
2468 _VALID_URL = r'''(?x)
2469 https?://
2470 (?:\w+\.)?
2471 (?:
2472 youtube(?:kids)?\.com|
2473 invidio\.us
2474 )/
2475 (?:
2476 (?:channel|c|user|feed)/|
2477 (?:playlist|watch)\?.*?\blist=|
2478 (?!(?:watch|embed|v|e)\b)
2479 )
2480 (?P<id>[^/?\#&]+)
2481 '''
2482 IE_NAME = 'youtube:tab'
2483
2484 _TESTS = [{
2485 # playlists, multipage
2486 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
2487 'playlist_mincount': 94,
2488 'info_dict': {
2489 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2490 'title': 'Игорь Клейнер - Playlists',
2491 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2492 },
2493 }, {
2494 # playlists, multipage, different order
2495 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
2496 'playlist_mincount': 94,
2497 'info_dict': {
2498 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
2499 'title': 'Игорь Клейнер - Playlists',
2500 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
2501 },
2502 }, {
2503 # playlists, singlepage
2504 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
2505 'playlist_mincount': 4,
2506 'info_dict': {
2507 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
2508 'title': 'ThirstForScience - Playlists',
2509 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
2510 }
2511 }, {
2512 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
2513 'only_matching': True,
2514 }, {
2515 # basic, single video playlist
2516 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2517 'info_dict': {
2518 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2519 'uploader': 'Sergey M.',
2520 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2521 'title': 'youtube-dl public playlist',
2522 },
2523 'playlist_count': 1,
2524 }, {
2525 # empty playlist
2526 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
2527 'info_dict': {
2528 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
2529 'uploader': 'Sergey M.',
2530 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
2531 'title': 'youtube-dl empty playlist',
2532 },
2533 'playlist_count': 0,
2534 }, {
2535 # Home tab
2536 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
2537 'info_dict': {
2538 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2539 'title': 'lex will - Home',
2540 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2541 },
2542 'playlist_mincount': 2,
2543 }, {
2544 # Videos tab
2545 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
2546 'info_dict': {
2547 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2548 'title': 'lex will - Videos',
2549 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2550 },
2551 'playlist_mincount': 975,
2552 }, {
2553 # Videos tab, sorted by popular
2554 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
2555 'info_dict': {
2556 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2557 'title': 'lex will - Videos',
2558 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2559 },
2560 'playlist_mincount': 199,
2561 }, {
2562 # Playlists tab
2563 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
2564 'info_dict': {
2565 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2566 'title': 'lex will - Playlists',
2567 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2568 },
2569 'playlist_mincount': 17,
2570 }, {
2571 # Community tab
2572 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
2573 'info_dict': {
2574 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2575 'title': 'lex will - Community',
2576 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2577 },
2578 'playlist_mincount': 18,
2579 }, {
2580 # Channels tab
2581 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
2582 'info_dict': {
2583 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
2584 'title': 'lex will - Channels',
2585 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
2586 },
2587 'playlist_mincount': 138,
2588 }, {
2589 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
2590 'only_matching': True,
2591 }, {
2592 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
2593 'only_matching': True,
2594 }, {
2595 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
2596 'only_matching': True,
2597 }, {
2598 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
2599 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2600 'info_dict': {
2601 'title': '29C3: Not my department',
2602 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
2603 'uploader': 'Christiaan008',
2604 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
2605 },
2606 'playlist_count': 96,
2607 }, {
2608 'note': 'Large playlist',
2609 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
2610 'info_dict': {
2611 'title': 'Uploads from Cauchemar',
2612 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
2613 'uploader': 'Cauchemar',
2614 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
2615 },
2616 'playlist_mincount': 1123,
2617 }, {
2618 # even larger playlist, 8832 videos
2619 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
2620 'only_matching': True,
2621 }, {
2622 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
2623 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
2624 'info_dict': {
2625 'title': 'Uploads from Interstellar Movie',
2626 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
2627 'uploader': 'Interstellar Movie',
2628 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
2629 },
2630 'playlist_mincount': 21,
2631 }, {
2632 # https://github.com/ytdl-org/youtube-dl/issues/21844
2633 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2634 'info_dict': {
2635 'title': 'Data Analysis with Dr Mike Pound',
2636 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
2637 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
2638 'uploader': 'Computerphile',
2639 },
2640 'playlist_mincount': 11,
2641 }, {
2642 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
2643 'only_matching': True,
2644 }, {
2645 # Playlist URL that does not actually serve a playlist
2646 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
2647 'info_dict': {
2648 'id': 'FqZTN594JQw',
2649 'ext': 'webm',
2650 'title': "Smiley's People 01 detective, Adventure Series, Action",
2651 'uploader': 'STREEM',
2652 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
2653 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
2654 'upload_date': '20150526',
2655 'license': 'Standard YouTube License',
2656 'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
2657 'categories': ['People & Blogs'],
2658 'tags': list,
2659 'view_count': int,
2660 'like_count': int,
2661 'dislike_count': int,
2662 },
2663 'params': {
2664 'skip_download': True,
2665 },
2666 'skip': 'This video is not available.',
2667 'add_ie': [YoutubeIE.ie_key()],
2668 }, {
2669 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
2670 'only_matching': True,
2671 }, {
2672 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
2673 'only_matching': True,
2674 }, {
2675 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
2676 'info_dict': {
2677 'id': '9Auq9mYxFEE',
2678 'ext': 'mp4',
2679 'title': 'Watch Sky News live',
2680 'uploader': 'Sky News',
2681 'uploader_id': 'skynews',
2682 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
2683 'upload_date': '20191102',
2684 'description': 'md5:78de4e1c2359d0ea3ed829678e38b662',
2685 'categories': ['News & Politics'],
2686 'tags': list,
2687 'like_count': int,
2688 'dislike_count': int,
2689 },
2690 'params': {
2691 'skip_download': True,
2692 },
2693 }, {
2694 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
2695 'info_dict': {
2696 'id': 'a48o2S1cPoo',
2697 'ext': 'mp4',
2698 'title': 'The Young Turks - Live Main Show',
2699 'uploader': 'The Young Turks',
2700 'uploader_id': 'TheYoungTurks',
2701 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
2702 'upload_date': '20150715',
2703 'license': 'Standard YouTube License',
2704 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
2705 'categories': ['News & Politics'],
2706 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
2707 'like_count': int,
2708 'dislike_count': int,
2709 },
2710 'params': {
2711 'skip_download': True,
2712 },
2713 'only_matching': True,
2714 }, {
2715 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
2716 'only_matching': True,
2717 }, {
2718 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
2719 'only_matching': True,
2720 }, {
2721 'url': 'https://www.youtube.com/feed/trending',
2722 'only_matching': True,
2723 }, {
2724 # needs auth
2725 'url': 'https://www.youtube.com/feed/library',
2726 'only_matching': True,
2727 }, {
2728 # needs auth
2729 'url': 'https://www.youtube.com/feed/history',
2730 'only_matching': True,
2731 }, {
2732 # needs auth
2733 'url': 'https://www.youtube.com/feed/subscriptions',
2734 'only_matching': True,
2735 }, {
2736 # needs auth
2737 'url': 'https://www.youtube.com/feed/watch_later',
2738 'only_matching': True,
2739 }, {
2740 # no longer available?
2741 'url': 'https://www.youtube.com/feed/recommended',
2742 'only_matching': True,
2743 }, {
2744 # inline playlist with not always working continuations
2745 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C',
2746 'only_matching': True,
2747 }, {
2748 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8',
2749 'only_matching': True,
2750 }, {
2751 'url': 'https://www.youtube.com/course',
2752 'only_matching': True,
2753 }, {
2754 'url': 'https://www.youtube.com/zsecurity',
2755 'only_matching': True,
2756 }, {
2757 'url': 'http://www.youtube.com/NASAgovVideo/videos',
2758 'only_matching': True,
2759 }, {
2760 'url': 'https://www.youtube.com/TheYoungTurks/live',
2761 'only_matching': True,
2762 }]
2763
2764 @classmethod
2765 def suitable(cls, url):
2766 return False if YoutubeIE.suitable(url) else super(
2767 YoutubeTabIE, cls).suitable(url)
2768
2769 def _extract_channel_id(self, webpage):
2770 channel_id = self._html_search_meta(
2771 'channelId', webpage, 'channel id', default=None)
2772 if channel_id:
2773 return channel_id
2774 channel_url = self._html_search_meta(
2775 ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
2776 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
2777 'twitter:app:url:googleplay'), webpage, 'channel url')
2778 return self._search_regex(
2779 r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+',
2780 channel_url, 'channel id')
2781
2782 @staticmethod
2783 def _extract_grid_item_renderer(item):
2784 for item_kind in ('Playlist', 'Video', 'Channel'):
2785 renderer = item.get('grid%sRenderer' % item_kind)
2786 if renderer:
2787 return renderer
2788
2789 def _extract_video(self, renderer):
2790 video_id = renderer.get('videoId')
2791 title = try_get(
2792 renderer,
2793 (lambda x: x['title']['runs'][0]['text'],
2794 lambda x: x['title']['simpleText']), compat_str)
2795 description = try_get(
2796 renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'],
2797 compat_str)
2798 duration = parse_duration(try_get(
2799 renderer, lambda x: x['lengthText']['simpleText'], compat_str))
2800 view_count_text = try_get(
2801 renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
2802 view_count = str_to_int(self._search_regex(
2803 r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
2804 'view count', default=None))
2805 uploader = try_get(
2806 renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
2807 return {
2808 '_type': 'url_transparent',
2809 'ie_key': YoutubeIE.ie_key(),
2810 'id': video_id,
2811 'url': video_id,
2812 'title': title,
2813 'description': description,
2814 'duration': duration,
2815 'view_count': view_count,
2816 'uploader': uploader,
2817 }
2818
    def _grid_entries(self, grid_renderer):
        """Yield entries for the items of a gridRenderer.

        Each grid item normally carries exactly one of playlistId, videoId
        or channelId; every id that is present produces one entry.
        """
        for item in grid_renderer['items']:
            if not isinstance(item, dict):
                continue
            renderer = self._extract_grid_item_renderer(item)
            if not isinstance(renderer, dict):
                continue
            title = try_get(
                renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
            # playlist
            playlist_id = renderer.get('playlistId')
            if playlist_id:
                yield self.url_result(
                    'https://www.youtube.com/playlist?list=%s' % playlist_id,
                    ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
                    video_title=title)
            # video
            video_id = renderer.get('videoId')
            if video_id:
                yield self._extract_video(renderer)
            # channel
            channel_id = renderer.get('channelId')
            if channel_id:
                # Channel titles use simpleText rather than runs.
                title = try_get(
                    renderer, lambda x: x['title']['simpleText'], compat_str)
                yield self.url_result(
                    'https://www.youtube.com/channel/%s' % channel_id,
                    ie=YoutubeTabIE.ie_key(), video_title=title)
2847
2848 def _shelf_entries_from_content(self, shelf_renderer):
2849 content = shelf_renderer.get('content')
2850 if not isinstance(content, dict):
2851 return
2852 renderer = content.get('gridRenderer')
2853 if renderer:
2854 # TODO: add support for nested playlists so each shelf is processed
2855 # as separate playlist
2856 # TODO: this includes only first N items
2857 for entry in self._grid_entries(renderer):
2858 yield entry
2859 renderer = content.get('horizontalListRenderer')
2860 if renderer:
2861 # TODO
2862 pass
2863
    def _shelf_entries(self, shelf_renderer, skip_channels=False):
        """Yield entries for a shelfRenderer.

        If the shelf links to its own page, that page is yielded as a
        playlist entry; entries embedded in the shelf content are yielded
        as well (the shelf URL may be absent entirely).
        """
        ep = try_get(
            shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
            compat_str)
        shelf_url = urljoin('https://www.youtube.com', ep)
        if shelf_url:
            # Skipping links to another channels, note that checking for
            # endpoint.commandMetadata.webCommandMetadata.webPageType == WEB_PAGE_TYPE_CHANNEL
            # will not work
            if skip_channels and '/channels?' in shelf_url:
                return
            title = try_get(
                shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
            yield self.url_result(shelf_url, video_title=title)
        # Shelf may not contain shelf URL, fallback to extraction from content
        for entry in self._shelf_entries_from_content(shelf_renderer):
            yield entry
2881
2882 def _playlist_entries(self, video_list_renderer):
2883 for content in video_list_renderer['contents']:
2884 if not isinstance(content, dict):
2885 continue
2886 renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
2887 if not isinstance(renderer, dict):
2888 continue
2889 video_id = renderer.get('videoId')
2890 if not video_id:
2891 continue
2892 yield self._extract_video(renderer)
2893
2894 def _video_entry(self, video_renderer):
2895 video_id = video_renderer.get('videoId')
2896 if video_id:
2897 return self._extract_video(video_renderer)
2898
2899 def _post_thread_entries(self, post_thread_renderer):
2900 post_renderer = try_get(
2901 post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
2902 if not post_renderer:
2903 return
2904 # video attachment
2905 video_renderer = try_get(
2906 post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict)
2907 video_id = None
2908 if video_renderer:
2909 entry = self._video_entry(video_renderer)
2910 if entry:
2911 yield entry
2912 # inline video links
2913 runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
2914 for run in runs:
2915 if not isinstance(run, dict):
2916 continue
2917 ep_url = try_get(
2918 run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
2919 if not ep_url:
2920 continue
2921 if not YoutubeIE.suitable(ep_url):
2922 continue
2923 ep_video_id = YoutubeIE._match_id(ep_url)
2924 if video_id == ep_video_id:
2925 continue
2926 yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=video_id)
2927
2928 def _post_thread_continuation_entries(self, post_thread_continuation):
2929 contents = post_thread_continuation.get('contents')
2930 if not isinstance(contents, list):
2931 return
2932 for content in contents:
2933 renderer = content.get('backstagePostThreadRenderer')
2934 if not isinstance(renderer, dict):
2935 continue
2936 for entry in self._post_thread_entries(renderer):
2937 yield entry
2938
2939 @staticmethod
2940 def _build_continuation_query(continuation, ctp=None):
2941 query = {
2942 'ctoken': continuation,
2943 'continuation': continuation,
2944 }
2945 if ctp:
2946 query['itct'] = ctp
2947 return query
2948
2949 @staticmethod
2950 def _extract_next_continuation_data(renderer):
2951 next_continuation = try_get(
2952 renderer, lambda x: x['continuations'][0]['nextContinuationData'], dict)
2953 if not next_continuation:
2954 return
2955 continuation = next_continuation.get('continuation')
2956 if not continuation:
2957 return
2958 ctp = next_continuation.get('clickTrackingParams')
2959 return YoutubeTabIE._build_continuation_query(continuation, ctp)
2960
    @classmethod
    def _extract_continuation(cls, renderer):
        """Return a continuation query for *renderer*, or None.

        Tries the old-style nextContinuationData layout first, then the
        newer continuationItemRenderer/continuationEndpoint layout found
        in the renderer's contents.
        """
        next_continuation = cls._extract_next_continuation_data(renderer)
        if next_continuation:
            return next_continuation
        contents = renderer.get('contents')
        if not isinstance(contents, list):
            return
        for content in contents:
            if not isinstance(content, dict):
                continue
            continuation_ep = try_get(
                content, lambda x: x['continuationItemRenderer']['continuationEndpoint'],
                dict)
            if not continuation_ep:
                continue
            continuation = try_get(
                continuation_ep, lambda x: x['continuationCommand']['token'], compat_str)
            if not continuation:
                continue
            ctp = continuation_ep.get('clickTrackingParams')
            return YoutubeTabIE._build_continuation_query(continuation, ctp)
2983
    def _entries(self, tab, identity_token):
        """Generate all entries for the selected tab.

        First walks the sectionListRenderer contents embedded in the page,
        collecting a continuation token along the way, then keeps fetching
        /browse_ajax continuation pages until no further token is found.
        """
        tab_content = try_get(tab, lambda x: x['content'], dict)
        if not tab_content:
            return
        slr_renderer = try_get(tab_content, lambda x: x['sectionListRenderer'], dict)
        if not slr_renderer:
            return
        is_channels_tab = tab.get('title') == 'Channels'
        continuation = None
        slr_contents = try_get(slr_renderer, lambda x: x['contents'], list) or []
        for slr_content in slr_contents:
            if not isinstance(slr_content, dict):
                continue
            is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict)
            if not is_renderer:
                continue
            isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
            for isr_content in isr_contents:
                if not isinstance(isr_content, dict):
                    continue
                # Exactly one of the renderer kinds below is expected per item.
                renderer = isr_content.get('playlistVideoListRenderer')
                if renderer:
                    for entry in self._playlist_entries(renderer):
                        yield entry
                    continuation = self._extract_continuation(renderer)
                    continue
                renderer = isr_content.get('gridRenderer')
                if renderer:
                    for entry in self._grid_entries(renderer):
                        yield entry
                    continuation = self._extract_continuation(renderer)
                    continue
                renderer = isr_content.get('shelfRenderer')
                if renderer:
                    # On non-Channels tabs, shelves linking to other
                    # channels are skipped.
                    for entry in self._shelf_entries(renderer, not is_channels_tab):
                        yield entry
                    continue
                renderer = isr_content.get('backstagePostThreadRenderer')
                if renderer:
                    for entry in self._post_thread_entries(renderer):
                        yield entry
                    continuation = self._extract_continuation(renderer)
                    continue
                renderer = isr_content.get('videoRenderer')
                if renderer:
                    entry = self._video_entry(renderer)
                    if entry:
                        yield entry

            if not continuation:
                continuation = self._extract_continuation(is_renderer)

        if not continuation:
            continuation = self._extract_continuation(slr_renderer)

        headers = {
            'x-youtube-client-name': '1',
            'x-youtube-client-version': '2.20201112.04.01',
        }
        if identity_token:
            headers['x-youtube-identity-token'] = identity_token

        for page_num in itertools.count(1):
            if not continuation:
                break
            count = 0
            retries = 3
            while count <= retries:
                try:
                    # Downloading page may result in intermittent 5xx HTTP error
                    # that is usually worked around with a retry
                    browse = self._download_json(
                        'https://www.youtube.com/browse_ajax', None,
                        'Downloading page %d%s'
                        % (page_num, ' (retry #%d)' % count if count else ''),
                        headers=headers, query=continuation)
                    break
                except ExtractorError as e:
                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
                        count += 1
                        if count <= retries:
                            continue
                    raise
            if not browse:
                break
            response = try_get(browse, lambda x: x[1]['response'], dict)
            if not response:
                break

            # Old-style continuation payloads.
            continuation_contents = try_get(
                response, lambda x: x['continuationContents'], dict)
            if continuation_contents:
                continuation_renderer = continuation_contents.get('playlistVideoListContinuation')
                if continuation_renderer:
                    for entry in self._playlist_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue
                continuation_renderer = continuation_contents.get('gridContinuation')
                if continuation_renderer:
                    for entry in self._grid_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue
                continuation_renderer = continuation_contents.get('itemSectionContinuation')
                if continuation_renderer:
                    for entry in self._post_thread_continuation_entries(continuation_renderer):
                        yield entry
                    continuation = self._extract_continuation(continuation_renderer)
                    continue

            # New-style appendContinuationItemsAction payloads.
            continuation_items = try_get(
                response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list)
            if continuation_items:
                continuation_item = continuation_items[0]
                if not isinstance(continuation_item, dict):
                    # NOTE(review): this re-requests the SAME continuation
                    # token on the next iteration; if the first item is
                    # persistently not a dict this could loop — confirm
                    # upstream intent before changing.
                    continue
                renderer = continuation_item.get('playlistVideoRenderer') or continuation_item.get('itemSectionRenderer')
                if renderer:
                    video_list_renderer = {'contents': continuation_items}
                    for entry in self._playlist_entries(video_list_renderer):
                        yield entry
                    continuation = self._extract_continuation(video_list_renderer)
                    continue

            break
3110
3111 @staticmethod
3112 def _extract_selected_tab(tabs):
3113 for tab in tabs:
3114 if try_get(tab, lambda x: x['tabRenderer']['selected'], bool):
3115 return tab['tabRenderer']
3116 else:
3117 raise ExtractorError('Unable to find selected tab')
3118
    @staticmethod
    def _extract_uploader(data):
        """Extract uploader name/id/url from the playlist sidebar.

        Returns a (possibly empty) dict suitable for merging into a
        playlist result. If several sidebar items carry owner info, the
        last one wins.
        """
        uploader = {}
        sidebar_renderer = try_get(
            data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list)
        if sidebar_renderer:
            for item in sidebar_renderer:
                if not isinstance(item, dict):
                    continue
                renderer = item.get('playlistSidebarSecondaryInfoRenderer')
                if not isinstance(renderer, dict):
                    continue
                owner = try_get(
                    renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
                if owner:
                    uploader['uploader'] = owner.get('text')
                    uploader['uploader_id'] = try_get(
                        owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
                    uploader['uploader_url'] = urljoin(
                        'https://www.youtube.com/',
                        try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
        return uploader
3141
3142 @staticmethod
3143 def _extract_alert(data):
3144 alerts = []
3145 for alert in try_get(data, lambda x: x['alerts'], list) or []:
3146 if not isinstance(alert, dict):
3147 continue
3148 alert_text = try_get(
3149 alert, lambda x: x['alertRenderer']['text'], dict)
3150 if not alert_text:
3151 continue
3152 text = try_get(
3153 alert_text,
3154 (lambda x: x['simpleText'], lambda x: x['runs'][0]['text']),
3155 compat_str)
3156 if text:
3157 alerts.append(text)
3158 return '\n'.join(alerts)
3159
    def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
        """Build a playlist result for a tab-style page (channel or playlist).

        Title/description come from channelMetadataRenderer first;
        playlistMetadataRenderer, when present, overrides them.
        """
        selected_tab = self._extract_selected_tab(tabs)
        renderer = try_get(
            data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
        playlist_id = title = description = None
        if renderer:
            channel_title = renderer.get('title') or item_id
            tab_title = selected_tab.get('title')
            title = channel_title or item_id
            if tab_title:
                # e.g. "lex will - Videos"
                title += ' - %s' % tab_title
            description = renderer.get('description')
            playlist_id = renderer.get('externalId')
        renderer = try_get(
            data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
        if renderer:
            title = renderer.get('title')
            description = None
            playlist_id = item_id
        playlist = self.playlist_result(
            self._entries(selected_tab, identity_token),
            playlist_id=playlist_id, playlist_title=title,
            playlist_description=description)
        playlist.update(self._extract_uploader(data))
        return playlist
3185
    def _extract_from_playlist(self, item_id, url, data, playlist):
        """Extract an inline (watch-page) playlist rendition.

        Prefers delegating to the regular tab-based playlist URL when one
        is advertised; otherwise extracts the inline entries directly.
        """
        title = playlist.get('title') or try_get(
            data, lambda x: x['titleText']['simpleText'], compat_str)
        playlist_id = playlist.get('playlistId') or item_id
        # Inline playlist rendition continuation does not always work
        # at Youtube side, so delegating regular tab-based playlist URL
        # processing whenever possible.
        playlist_url = urljoin(url, try_get(
            playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
            compat_str))
        if playlist_url and playlist_url != url:
            return self.url_result(
                playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
                video_title=title)
        return self.playlist_result(
            self._playlist_entries(playlist), playlist_id=playlist_id,
            playlist_title=title)
3203
3204 def _extract_identity_token(self, webpage, item_id):
3205 ytcfg = self._extract_ytcfg(item_id, webpage)
3206 if ytcfg:
3207 token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
3208 if token:
3209 return token
3210 return self._search_regex(
3211 r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
3212 'identity token', default=None)
3213
    def _real_extract(self, url):
        """Dispatch a tab/playlist/watch URL to the appropriate extraction path.

        Order matters: tabs page, then inline watch-page playlist, then
        plain video fallback, then alert reporting.
        """
        item_id = self._match_id(url)
        url = compat_urlparse.urlunparse(
            compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
        # Handle both video/playlist URLs
        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = qs.get('v', [None])[0]
        playlist_id = qs.get('list', [None])[0]
        if video_id and playlist_id:
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
        webpage = self._download_webpage(url, item_id)
        identity_token = self._extract_identity_token(webpage, item_id)
        data = self._extract_yt_initial_data(item_id, webpage)
        tabs = try_get(
            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
        if tabs:
            return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)
        playlist = try_get(
            data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
        if playlist:
            return self._extract_from_playlist(item_id, url, data, playlist)
        # Fallback to video extraction if no playlist alike page is recognized.
        # First check for the current video then try the v attribute of URL query.
        video_id = try_get(
            data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
            compat_str) or video_id
        if video_id:
            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
        # Capture and output alerts
        alert = self._extract_alert(data)
        if alert:
            raise ExtractorError(alert, expected=True)
        # Failed to recognize
        raise ExtractorError('Unable to recognize tab page')
3251
3252
class YoutubePlaylistIE(InfoExtractor):
    """Thin wrapper that forwards playlist URLs/ids to YoutubeTabIE."""
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r'''(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /.*?\?.*?\blist=
                        )?
                        (?P<id>%(playlist_id)s)
                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
        },
        'playlist_mincount': 29,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 982,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
        }
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Defer to YoutubeTabIE when it already matches the URL."""
        if YoutubeTabIE.suitable(url):
            return False
        return super(YoutubePlaylistIE, cls).suitable(url)

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        # A bare playlist id has no query string; synthesize one.
        return self.url_result(
            update_url_query(
                'https://www.youtube.com/playlist',
                qs or {'list': playlist_id}),
            ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3327
3328
class YoutubeYtBeIE(InfoExtractor):
    """Handle youtu.be short links that carry a list= parameter."""
    _VALID_URL = r'https?://youtu\.be/(?P<id>[0-9A-Za-z_-]{11})/*?.*?\blist=(?P<playlist_id>%(playlist_id)s)' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TESTS = [{
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Rebuild an equivalent full watch URL and hand it to the tab extractor.
        video_id, playlist_id = re.match(self._VALID_URL, url).group(
            'id', 'playlist_id')
        return self.url_result(
            update_url_query('https://www.youtube.com/watch', {
                'v': video_id,
                'list': playlist_id,
                'feature': 'youtu.be',
            }), ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
3366
3367
class YoutubeYtUserIE(InfoExtractor):
    """Resolve the internal 'ytuser:NAME' scheme to a user channel URL."""
    _VALID_URL = r'ytuser:(?P<id>.+)'
    _TESTS = [{
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        user_id = self._match_id(url)
        user_url = 'https://www.youtube.com/user/%s' % user_id
        return self.url_result(
            user_url, ie=YoutubeTabIE.ie_key(), video_id=user_id)
3380
3381
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve ':ytfav' to the authenticated user's liked-videos playlist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True
    _TESTS = [{
        'url': ':ytfav',
        'only_matching': True,
    }, {
        'url': ':ytfavorites',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # LL is the special id of the logged-in user's liked-videos list.
        liked_list_url = 'https://www.youtube.com/playlist?list=LL'
        return self.url_result(liked_list_url, ie=YoutubeTabIE.ie_key())
3399
3400
class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
    """Search extractor backing the 'ytsearch<N>:<query>' pseudo-URLs."""
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Optional filter parameter sent to the API (URL-encoded protobuf);
    # subclasses override it, None means an unfiltered search.
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to n url_transparent video results for query.

        Pages through the youtubei/v1/search API, following continuation
        tokens until n results have been yielded, a page fails to download,
        or no further continuation token is present.
        """
        data = {
            'context': {
                'client': {
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        for page_num in itertools.count(1):
            # The key in the URL is the public API key of the regular web
            # client, not a user credential.
            search = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(data).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not search:
                break
            # The first response and continuation responses nest the
            # section list differently; probe both layouts.
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break
            isr_contents = try_get(
                slr_contents,
                lambda x: x[0]['itemSectionRenderer']['contents'],
                list)
            if not isr_contents:
                break
            for content in isr_contents:
                if not isinstance(content, dict):
                    continue
                # Non-video entries (channels, playlists, ads) have no
                # videoRenderer and are skipped.
                video = content.get('videoRenderer')
                if not isinstance(video, dict):
                    continue
                video_id = video.get('videoId')
                if not video_id:
                    continue
                title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
                description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
                duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
                view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
                # viewCountText is human-formatted (e.g. '1,234 views').
                # Match digits together with thousands separators and let
                # str_to_int strip them; the previous r'^(\d+)' stopped at
                # the first comma and truncated counts >= 1000 (e.g.
                # '1,234 views' -> 1).
                view_count = str_to_int(self._search_regex(
                    r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
                    'view count', default=None))
                uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
                total += 1
                yield {
                    '_type': 'url_transparent',
                    'ie_key': YoutubeIE.ie_key(),
                    'id': video_id,
                    'url': video_id,
                    'title': title,
                    'description': description,
                    'duration': duration,
                    'view_count': view_count,
                    'uploader': uploader,
                }
                if total == n:
                    return
            # The continuation token for the next page, when present, sits
            # in the second element of the section list.
            token = try_get(
                slr_contents,
                lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                compat_str)
            if not token:
                break
            data['continuation'] = token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
3489
3490
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Same search as YoutubeSearchIE, but results sorted newest-first."""
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
    # URL-encoded filter value ('CAI=') passed as the API 'params' field to
    # request sorting by upload date.
    _SEARCH_PARAMS = 'CAI%3D'
3496
3497
3498r"""
3499class YoutubeSearchURLIE(YoutubeSearchIE):
3500 IE_DESC = 'YouTube.com search URLs'
3501 IE_NAME = 'youtube:search_url'
3502 _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
3503 _TESTS = [{
3504 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
3505 'playlist_mincount': 5,
3506 'info_dict': {
3507 'title': 'youtube-dl test video',
3508 }
3509 }, {
3510 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
3511 'only_matching': True,
3512 }]
3513
3514 def _real_extract(self, url):
3515 mobj = re.match(self._VALID_URL, url)
3516 query = compat_urllib_parse_unquote_plus(mobj.group('query'))
3517 webpage = self._download_webpage(url, query)
3518 return self.playlist_result(self._process_page(webpage), playlist_title=query)
3519"""
3520
3521
class YoutubeFeedsInfoExtractor(YoutubeTabIE):
    """Shared implementation for the personal feed extractors.

    Subclasses only need to define _FEED_NAME; login handling and the
    delegation to the tab extractor live here.
    """
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        # Derive the extractor name from the feed, e.g. 'youtube:history'.
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # Feeds are per-account, so authenticate before extraction starts.
        self._login()

    def _real_extract(self, url):
        feed_url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME
        return self.url_result(feed_url, ie=YoutubeTabIE.ie_key())
3540
3541
class YoutubeWatchLaterIE(InfoExtractor):
    """Resolve the ':ytwatchlater' shorthand to the 'WL' playlist."""
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r':ytwatchlater'
    _TESTS = [{
        'url': ':ytwatchlater',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # 'WL' is YouTube's reserved playlist ID for Watch Later.
        watch_later_url = 'https://www.youtube.com/playlist?list=WL'
        return self.url_result(watch_later_url, ie=YoutubeTabIE.ie_key())
3554
3555
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """':ytrec' feed — the signed-in user's recommended videos."""
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r':ytrec(?:ommended)?'
    # Maps to https://www.youtube.com/feed/recommended via the base class.
    _FEED_NAME = 'recommended'
    _TESTS = [{
        'url': ':ytrec',
        'only_matching': True,
    }, {
        'url': ':ytrecommended',
        'only_matching': True,
    }]
3567
3568
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """':ytsubs' feed — the signed-in user's subscriptions feed."""
    # Fixed IE_DESC: it previously read '"ytsubs" keyword', omitting the
    # leading colon that _VALID_URL requires and breaking consistency with
    # the other shorthand extractors (':ytfav', ':ytrec', ':ythistory').
    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
    _VALID_URL = r':ytsubs(?:criptions)?'
    # Maps to https://www.youtube.com/feed/subscriptions via the base class.
    _FEED_NAME = 'subscriptions'
    _TESTS = [{
        'url': ':ytsubs',
        'only_matching': True,
    }, {
        'url': ':ytsubscriptions',
        'only_matching': True,
    }]
3580
3581
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """':ythistory' feed — the signed-in user's watch history."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r':ythistory'
    # Maps to https://www.youtube.com/feed/history via the base class.
    _FEED_NAME = 'history'
    _TESTS = [{
        'url': ':ythistory',
        'only_matching': True,
    }]
3590
3591
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch/attribution URLs whose video ID is missing entirely.

    These typically result from an unquoted URL on the command line, where
    the shell split it at '&' and only the first query parameter survived.
    Extraction always fails with an explanatory error.
    """
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False # Do not list
    # Matches a watch URL containing only one of the non-essential query
    # parameters (or an attribution link without a 'u' target) and nothing
    # else — i.e. the 'v=<id>' part is gone.
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
        attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Refuse extraction outright — there is no video ID to work with.
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply youtube-dl BaW_jenozKc .',
            expected=True)
3639
3640
class YoutubeTruncatedIDIE(InfoExtractor):
    """Match watch URLs whose video ID is shorter than the required length."""
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # An ID this short was most likely cut off while copying the URL;
        # fail with a hint instead of querying YouTube.
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (
            video_id, url)
        raise ExtractorError(message, expected=True)
3656