# Snapshot dated Oct 23, 2020
1# coding: utf-8
2
3from __future__ import unicode_literals
4
5
6import itertools
7import json
8import os.path
9import random
10import re
11import time
12import traceback
13
14from .common import InfoExtractor, SearchInfoExtractor
15from ..jsinterp import JSInterpreter
16from ..swfinterp import SWFInterpreter
17from ..compat import (
18 compat_chr,
19 compat_HTTPError,
20 compat_kwargs,
21 compat_parse_qs,
22 compat_urllib_parse_unquote,
23 compat_urllib_parse_unquote_plus,
24 compat_urllib_parse_urlencode,
25 compat_urllib_parse_urlparse,
26 compat_urlparse,
27 compat_str,
28)
29from ..utils import (
30 bool_or_none,
31 clean_html,
32 error_to_compat_str,
33 extract_attributes,
34 ExtractorError,
35 float_or_none,
36 get_element_by_attribute,
37 get_element_by_id,
38 int_or_none,
39 mimetype2ext,
40 orderedSet,
41 parse_codecs,
42 parse_duration,
43 remove_quotes,
44 remove_start,
45 smuggle_url,
46 str_or_none,
47 str_to_int,
48 try_get,
49 unescapeHTML,
50 unified_strdate,
51 unsmuggle_url,
52 uppercase_escape,
53 url_or_none,
54 urlencode_postdata,
55)
56
57
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""

    # Google account sign-in endpoints used by the legacy web login flow.
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'

    _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
    _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
    # {0} is filled with the "TL" token extracted from the challenge response.
    _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'

    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    # Matches the known playlist ID prefix families (regular playlists,
    # liked/uploads/favorites lists, mixes, auto-generated albums, ...).
    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'

    # Headers identifying the classic web client to YouTube's JSON endpoints.
    _YOUTUBE_CLIENT_HEADERS = {
        'x-youtube-client-name': '1',
        'x-youtube-client-version': '1.20200609.04.02',
    }

    def _set_language(self):
        # Pin the interface language to English via the PREF cookie so that
        # scraped text (labels, dates) is predictable for the regexes below.
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
            # YouTube sets the expire time to about two months
            expire_time=time.time() + 2 * 30 * 24 * 3600)

    def _ids_to_results(self, ids):
        # Wrap plain video IDs into url_result dicts handled by YoutubeIE.
        return [
            self.url_result(vid_id, 'Youtube', video_id=vid_id)
            for vid_id in ids]

    def _login(self):
        """
        Attempt to log in to YouTube.
        True is returned if successful or skipped.
        False is returned if login failed.
        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
        """
        username, password = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return True

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note='Downloading login page',
            errnote='unable to fetch login page', fatal=False)
        if login_page is False:
            # NOTE(review): returns None rather than False here; callers only
            # test truthiness, so both behave the same.
            return

        login_form = self._hidden_inputs(login_page)

        def req(url, f_req, note, errnote):
            # POST one Google sign-in RPC; f_req is the opaque nested-array
            # payload expected by the endpoint.  Responses carry an anti-XSSI
            # prefix which transform_source strips before JSON parsing.
            # Returns the parsed JSON, or False on failure (fatal=False).
            data = login_form.copy()
            data.update({
                'pstMsg': 1,
                'checkConnection': 'youtube',
                'checkedDomains': 'youtube',
                'hl': 'en',
                'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
                'f.req': json.dumps(f_req),
                'flowName': 'GlifWebSignIn',
                'flowEntry': 'ServiceLogin',
                # TODO: reverse actual botguard identifier generation algo
                'bgRequest': '["identifier",""]',
            })
            return self._download_json(
                url, None, note=note, errnote=errnote,
                transform_source=lambda s: re.sub(r'^[^[]*', '', s),
                fatal=False,
                data=urlencode_postdata(data), headers={
                    'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
                    'Google-Accounts-XSRF': 1,
                })

        def warn(message):
            self._downloader.report_warning(message)

        # Opaque request structure reverse-engineered from the web login
        # flow; most positional fields are of unknown meaning.
        lookup_req = [
            username,
            None, [], None, 'US', None, None, 2, False, True,
            [
                None, None,
                [2, 1, None, 1,
                 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
                 None, [], 4],
                1, [None, None, []], None, None, None, True
            ],
            username,
        ]

        lookup_results = req(
            self._LOOKUP_URL, lookup_req,
            'Looking up account info', 'Unable to look up account info')

        if lookup_results is False:
            return False

        user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
        if not user_hash:
            warn('Unable to extract user hash')
            return False

        challenge_req = [
            user_hash,
            None, 1, None, [1, None, None, None, [password, None, True]],
            [
                None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
                1, [None, None, []], None, None, None, True
            ]]

        challenge_results = req(
            self._CHALLENGE_URL, challenge_req,
            'Logging in', 'Unable to log in')

        if challenge_results is False:
            return

        login_res = try_get(challenge_results, lambda x: x[0][5], list)
        if login_res:
            login_msg = try_get(login_res, lambda x: x[5], compat_str)
            # FIX: the conditional expression must be parenthesized; '%'
            # binds tighter than 'if/else', so the original dropped the
            # 'Unable to login: ' prefix for every message other than
            # INCORRECT_ANSWER_ENTERED.
            warn(
                'Unable to login: %s' % (
                    'Invalid password'
                    if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
            return False

        res = try_get(challenge_results, lambda x: x[0][-1], list)
        if not res:
            warn('Unable to extract result entry')
            return False

        login_challenge = try_get(res, lambda x: x[0][0], list)
        if login_challenge:
            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
            if challenge_str == 'TWO_STEP_VERIFICATION':
                # SEND_SUCCESS - TFA code has been successfully sent to phone
                # QUOTA_EXCEEDED - reached the limit of TFA codes
                status = try_get(login_challenge, lambda x: x[5], compat_str)
                if status == 'QUOTA_EXCEEDED':
                    warn('Exceeded the limit of TFA codes, try later')
                    return False

                tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
                if not tl:
                    warn('Unable to extract TL')
                    return False

                tfa_code = self._get_tfa_info('2-step verification code')

                if not tfa_code:
                    warn(
                        'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
                        '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
                    return False

                # Codes may be entered with the "G-" prefix shown in SMS.
                tfa_code = remove_start(tfa_code, 'G-')

                tfa_req = [
                    user_hash, None, 2, None,
                    [
                        9, None, None, None, None, None, None, None,
                        [None, tfa_code, True, 2]
                    ]]

                tfa_results = req(
                    self._TFA_URL.format(tl), tfa_req,
                    'Submitting TFA code', 'Unable to submit TFA code')

                if tfa_results is False:
                    return False

                tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
                if tfa_res:
                    tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
                    # FIX: same precedence bug as the login warning above.
                    warn(
                        'Unable to finish TFA: %s' % (
                            'Invalid TFA code'
                            if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
                    return False

                check_cookie_url = try_get(
                    tfa_results, lambda x: x[0][-1][2], compat_str)
            else:
                CHALLENGES = {
                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
                }
                challenge = CHALLENGES.get(
                    challenge_str,
                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
                return False
        else:
            check_cookie_url = try_get(res, lambda x: x[2], compat_str)

        if not check_cookie_url:
            warn('Unable to extract CheckCookie URL')
            return False

        check_cookie_results = self._download_webpage(
            check_cookie_url, None, 'Checking cookie', fatal=False)

        if check_cookie_results is False:
            return False

        if 'https://myaccount.google.com/' not in check_cookie_results:
            warn('Unable to log in')
            return False

        return True

    def _download_webpage_handle(self, *args, **kwargs):
        # Force the classic (disable_polymer) layout on every request so the
        # HTML-scraping regexes in the extractors keep working.
        query = kwargs.get('query', {}).copy()
        query['disable_polymer'] = 'true'
        kwargs['query'] = query
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _real_initialize(self):
        # Runs once before extraction: set the language cookie and attempt to
        # log in.  A failed login only warns; extraction still proceeds.
        if self._downloader is None:
            return
        self._set_language()
        if not self._login():
            return
284
285
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
    """Base for pages that paginate through a "Load more" button."""

    def _entries(self, page, playlist_id):
        """Yield entries from *page*, following "Load more" continuations."""
        widget_html = html = page
        for page_num in itertools.count(1):
            for entry in self._process_page(html):
                yield entry

            match = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', widget_html)
            if match is None:
                return

            retries = 3
            for attempt in itertools.count():
                try:
                    # Downloading page may result in intermittent 5xx HTTP
                    # error that is usually worked around with a retry
                    more = self._download_json(
                        'https://www.youtube.com/%s' % match.group('more'),
                        playlist_id,
                        'Downloading page #%s%s' % (
                            page_num,
                            ' (retry #%d)' % attempt if attempt else ''),
                        transform_source=uppercase_escape,
                        headers=self._YOUTUBE_CLIENT_HEADERS)
                    break
                except ExtractorError as e:
                    retryable = (
                        isinstance(e.cause, compat_HTTPError)
                        and e.cause.code in (500, 503))
                    if not retryable or attempt >= retries:
                        raise

            html = more['content_html']
            if not html.strip():
                # Some webpages show a "Load more" button but they don't
                # have more videos
                return
            widget_html = more['load_more_widget_html']
324
325
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Base for playlist-like pages: turns video matches into url_results."""

    def _process_page(self, content):
        """Yield a url_result for every video found in *content*."""
        for video_id, video_title in self.extract_videos_from_page(content):
            yield self.url_result(video_id, 'Youtube', video_id, video_title)

    def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
        """Append (id, title) matches of *video_re* to the given parallel
        lists, de-duplicating IDs and keeping the first non-empty title."""
        for match in re.finditer(video_re, page):
            groups = match.groupdict()
            # The link with index 0 is not the first video of the playlist (not sure if still actual)
            if 'index' in groups and match.group('id') == '0':
                continue
            video_id = match.group('id')
            video_title = unescapeHTML(match.group('title')) if 'title' in groups else None
            if video_title:
                video_title = video_title.strip()
                if video_title == '► Play all':
                    video_title = None
            if video_id in ids_in_page:
                # Already seen: fill in a title that was missing before.
                idx = ids_in_page.index(video_id)
                if video_title and not titles_in_page[idx]:
                    titles_in_page[idx] = video_title
            else:
                ids_in_page.append(video_id)
                titles_in_page.append(video_title)

    def extract_videos_from_page(self, page):
        """Return an iterable of (video_id, title) pairs found in *page*."""
        ids_in_page = []
        titles_in_page = []
        self.extract_videos_from_page_impl(
            self._VIDEO_RE, page, ids_in_page, titles_in_page)
        return zip(ids_in_page, titles_in_page)
357
358
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
    """Base for pages that list multiple playlists."""

    def _process_page(self, content):
        """Yield a url_result for each unique playlist linked from *content*."""
        playlist_ids = re.findall(
            r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
            content)
        for playlist_id in orderedSet(playlist_ids):
            yield self.url_result(
                'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)
        title = self._og_search_title(webpage, fatal=False)
        return self.playlist_result(
            self._entries(webpage, playlist_id), playlist_id, title)
372
373
374class YoutubeIE(YoutubeBaseInfoExtractor):
375 IE_DESC = 'YouTube.com'
376 _VALID_URL = r"""(?x)^
377 (
378 (?:https?://|//) # http(s):// or protocol-independent URL
379 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
380 (?:www\.)?deturl\.com/www\.youtube\.com/|
381 (?:www\.)?pwnyoutube\.com/|
382 (?:www\.)?hooktube\.com/|
383 (?:www\.)?yourepeat\.com/|
384 tube\.majestyc\.net/|
385 # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
386 (?:(?:www|dev)\.)?invidio\.us/|
387 (?:(?:www|no)\.)?invidiou\.sh/|
388 (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/|
389 (?:www\.)?invidious\.kabi\.tk/|
390 (?:www\.)?invidious\.13ad\.de/|
391 (?:www\.)?invidious\.mastodon\.host/|
392 (?:www\.)?invidious\.nixnet\.xyz/|
393 (?:www\.)?invidious\.drycat\.fr/|
394 (?:www\.)?tube\.poal\.co/|
395 (?:www\.)?vid\.wxzm\.sx/|
396 (?:www\.)?yewtu\.be/|
397 (?:www\.)?yt\.elukerio\.org/|
398 (?:www\.)?yt\.lelux\.fi/|
399 (?:www\.)?invidious\.ggc-project\.de/|
400 (?:www\.)?yt\.maisputain\.ovh/|
401 (?:www\.)?invidious\.13ad\.de/|
402 (?:www\.)?invidious\.toot\.koeln/|
403 (?:www\.)?invidious\.fdn\.fr/|
404 (?:www\.)?watch\.nettohikari\.com/|
405 (?:www\.)?kgg2m7yk5aybusll\.onion/|
406 (?:www\.)?qklhadlycap4cnod\.onion/|
407 (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
408 (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
409 (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
410 (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
411 (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
412 (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
413 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
414 (?:.*?\#/)? # handle anchor (#/) redirect urls
415 (?: # the various things that can precede the ID:
416 (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
417 |(?: # or the v= param in all its forms
418 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
419 (?:\?|\#!?) # the params delimiter ? or # or #!
420 (?:.*?[&;])?? # any other preceding param (like /?s=tuff&v=xxxx or ?s=tuff&v=V36LpHqtcDY)
421 v=
422 )
423 ))
424 |(?:
425 youtu\.be| # just youtu.be/xxxx
426 vid\.plus| # or vid.plus/xxxx
427 zwearz\.com/watch| # or zwearz.com/watch/xxxx
428 )/
429 |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
430 )
431 )? # all until now is optional -> you can pass the naked ID
432 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
433 (?!.*?\blist=
434 (?:
435 %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
436 WL # WL are handled by the watch later IE
437 )
438 )
439 (?(1).+)? # if we found the ID, everything can follow
440 $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
441 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
442 _PLAYER_INFO_RE = (
443 r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
444 r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
445 )
446 _formats = {
447 '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
448 '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
449 '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
450 '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
451 '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
452 '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
453 '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
454 '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
455 # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
456 '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
457 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
458 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
459 '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
460 '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
461 '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
462 '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
463 '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
464 '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
465
466
467 # 3D videos
468 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
469 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
470 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
471 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
472 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
473 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
474 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
475
476 # Apple HTTP Live Streaming
477 '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
478 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
479 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
480 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
481 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
482 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
483 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
484 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
485
486 # DASH mp4 video
487 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
488 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
489 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
490 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
491 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
492 '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
493 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
494 '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
495 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
496 '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
497 '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
498 '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
499
500 # Dash mp4 audio
501 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
502 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
503 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
504 '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
505 '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
506 '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
507 '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
508
509 # Dash webm
510 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
511 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
512 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
513 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
514 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
515 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
516 '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
517 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
518 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
519 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
520 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
521 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
522 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
523 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
524 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
525 # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
526 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
527 '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
528 '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
529 '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
530 '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
531 '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
532
533 # Dash webm audio
534 '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
535 '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
536
537 # Dash webm audio with opus inside
538 '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
539 '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
540 '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
541
542 # RTMP (unnamed)
543 '_rtmp': {'protocol': 'rtmp'},
544
545 # av01 video only formats sometimes served with "unknown" codecs
546 '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
547 '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
548 '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
549 '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
550 }
551 _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
552
553 _GEO_BYPASS = False
554
555 IE_NAME = 'youtube'
556 _TESTS = [
557 {
558 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9',
559 'info_dict': {
560 'id': 'BaW_jenozKc',
561 'ext': 'mp4',
562 'title': 'youtube-dl test video "\'/\\ä↭?',
563 'uploader': 'Philipp Hagemeister',
564 'uploader_id': 'phihag',
565 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
566 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
567 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
568 'upload_date': '20121002',
569 'description': 'test chars: "\'/\\ä↭?\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
570 'categories': ['Science & Technology'],
571 'tags': ['youtube-dl'],
572 'duration': 10,
573 'view_count': int,
574 'like_count': int,
575 'dislike_count': int,
576 'start_time': 1,
577 'end_time': 9,
578 }
579 },
580 {
581 'url': 'https://www.youtube.com/watch?v=UxxajLWwzqY',
582 'note': 'Test generic use_cipher_signature video (#897)',
583 'info_dict': {
584 'id': 'UxxajLWwzqY',
585 'ext': 'mp4',
586 'upload_date': '20120506',
587 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
588 'alt_title': 'I Love It (feat. Charli XCX)',
589 'description': 'md5:19a2f98d9032b9311e686ed039564f63',
590 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
591 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
592 'iconic ep', 'iconic', 'love', 'it'],
593 'duration': 180,
594 'uploader': 'Icona Pop',
595 'uploader_id': 'IconaPop',
596 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',
597 'creator': 'Icona Pop',
598 'track': 'I Love It (feat. Charli XCX)',
599 'artist': 'Icona Pop',
600 }
601 },
602 {
603 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
604 'note': 'Test VEVO video with age protection (#956)',
605 'info_dict': {
606 'id': '07FYdnEawAQ',
607 'ext': 'mp4',
608 'upload_date': '20130703',
609 'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)',
610 'alt_title': 'Tunnel Vision',
611 'description': 'md5:07dab3356cde4199048e4c7cd93471e1',
612 'duration': 419,
613 'uploader': 'justintimberlakeVEVO',
614 'uploader_id': 'justintimberlakeVEVO',
615 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
616 'creator': 'Justin Timberlake',
617 'track': 'Tunnel Vision',
618 'artist': 'Justin Timberlake',
619 'age_limit': 18,
620 }
621 },
622 {
623 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
624 'note': 'Embed-only video (#1746)',
625 'info_dict': {
626 'id': 'yZIXLfi8CZQ',
627 'ext': 'mp4',
628 'upload_date': '20120608',
629 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
630 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
631 'uploader': 'SET India',
632 'uploader_id': 'setindia',
633 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
634 'age_limit': 18,
635 }
636 },
637 {
638 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
639 'note': 'Use the first video ID in the URL',
640 'info_dict': {
641 'id': 'BaW_jenozKc',
642 'ext': 'mp4',
643 'title': 'youtube-dl test video "\'/\\ä↭?',
644 'uploader': 'Philipp Hagemeister',
645 'uploader_id': 'phihag',
646 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
647 'upload_date': '20121002',
648 'description': 'test chars: "\'/\\ä↭?\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
649 'categories': ['Science & Technology'],
650 'tags': ['youtube-dl'],
651 'duration': 10,
652 'view_count': int,
653 'like_count': int,
654 'dislike_count': int,
655 },
656 'params': {
657 'skip_download': True,
658 },
659 },
660 {
661 'url': 'https://www.youtube.com/watch?v=a9LDPn-MO4I',
662 'note': '256k DASH audio (format 141) via DASH manifest',
663 'info_dict': {
664 'id': 'a9LDPn-MO4I',
665 'ext': 'm4a',
666 'upload_date': '20121002',
667 'uploader_id': '8KVIDEO',
668 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
669 'description': '',
670 'uploader': '8KVIDEO',
671 'title': 'UHDTV TEST 8K VIDEO.mp4'
672 },
673 'params': {
674 'youtube_include_dash_manifest': True,
675 'format': '141',
676 },
677 'skip': 'format 141 not served anymore',
678 },
679 # DASH manifest with encrypted signature
680 {
681 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
682 'info_dict': {
683 'id': 'IB3lcPjvWLA',
684 'ext': 'm4a',
685 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
686 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
687 'duration': 244,
688 'uploader': 'AfrojackVEVO',
689 'uploader_id': 'AfrojackVEVO',
690 'upload_date': '20131011',
691 },
692 'params': {
693 'youtube_include_dash_manifest': True,
694 'format': '141/bestaudio[ext=m4a]',
695 },
696 },
697 # JS player signature function name containing $
698 {
699 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
700 'info_dict': {
701 'id': 'nfWlot6h_JM',
702 'ext': 'm4a',
703 'title': 'Taylor Swift - Shake It Off',
704 'description': 'md5:307195cd21ff7fa352270fe884570ef0',
705 'duration': 242,
706 'uploader': 'TaylorSwiftVEVO',
707 'uploader_id': 'TaylorSwiftVEVO',
708 'upload_date': '20140818',
709 },
710 'params': {
711 'youtube_include_dash_manifest': True,
712 'format': '141/bestaudio[ext=m4a]',
713 },
714 },
715 # Controversy video
716 {
717 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
718 'info_dict': {
719 'id': 'T4XJQO3qol8',
720 'ext': 'mp4',
721 'duration': 219,
722 'upload_date': '20100909',
723 'uploader': 'Amazing Atheist',
724 'uploader_id': 'TheAmazingAtheist',
725 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
726 'title': 'Burning Everyone\'s Koran',
727 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
728 }
729 },
730 # Normal age-gate video (No vevo, embed allowed)
731 {
732 'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
733 'info_dict': {
734 'id': 'HtVdAasjOgU',
735 'ext': 'mp4',
736 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer',
737 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
738 'duration': 142,
739 'uploader': 'The Witcher',
740 'uploader_id': 'WitcherGame',
741 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
742 'upload_date': '20140605',
743 'age_limit': 18,
744 },
745 },
746 # Age-gate video with encrypted signature
747 {
748 'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU',
749 'info_dict': {
750 'id': '6kLq3WMV1nU',
751 'ext': 'mp4',
752 'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
753 'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
754 'duration': 246,
755 'uploader': 'LloydVEVO',
756 'uploader_id': 'LloydVEVO',
757 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',
758 'upload_date': '20110629',
759 'age_limit': 18,
760 },
761 },
762 # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
763 # YouTube Red ad is not captured for creator
764 {
765 'url': '__2ABJjxzNo',
766 'info_dict': {
767 'id': '__2ABJjxzNo',
768 'ext': 'mp4',
769 'duration': 266,
770 'upload_date': '20100430',
771 'uploader_id': 'deadmau5',
772 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
773 'creator': 'Dada Life, deadmau5',
774 'description': 'md5:12c56784b8032162bb936a5f76d55360',
775 'uploader': 'deadmau5',
776 'title': 'Deadmau5 - Some Chords (HD)',
777 'alt_title': 'This Machine Kills Some Chords',
778 },
779 'expected_warnings': [
780 'DASH manifest missing',
781 ]
782 },
783 # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
784 {
785 'url': 'lqQg6PlCWgI',
786 'info_dict': {
787 'id': 'lqQg6PlCWgI',
788 'ext': 'mp4',
789 'duration': 6085,
790 'upload_date': '20150827',
791 'uploader_id': 'olympic',
792 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
793 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
794 'uploader': 'Olympic',
795 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
796 },
797 'params': {
798 'skip_download': 'requires avconv',
799 }
800 },
801 # Non-square pixels
802 {
803 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
804 'info_dict': {
805 'id': '_b-2C3KPAM0',
806 'ext': 'mp4',
807 'stretched_ratio': 16 / 9.,
808 'duration': 85,
809 'upload_date': '20110310',
810 'uploader_id': 'AllenMeow',
811 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
812 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
813 'uploader': '孫ᄋᄅ',
814 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
815 },
816 },
817 # url_encoded_fmt_stream_map is empty string
818 {
819 'url': 'qEJwOuvDf7I',
820 'info_dict': {
821 'id': 'qEJwOuvDf7I',
822 'ext': 'webm',
823 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
824 'description': '',
825 'upload_date': '20150404',
826 'uploader_id': 'spbelect',
827 'uploader': 'Наблюдатели Петербурга',
828 },
829 'params': {
830 'skip_download': 'requires avconv',
831 },
832 'skip': 'This live event has ended.',
833 },
834 # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097)
835 {
836 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y',
837 'info_dict': {
838 'id': 'FIl7x6_3R5Y',
839 'ext': 'webm',
840 'title': 'md5:7b81415841e02ecd4313668cde88737a',
841 'description': 'md5:116377fd2963b81ec4ce64b542173306',
842 'duration': 220,
843 'upload_date': '20150625',
844 'uploader_id': 'dorappi2000',
845 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
846 'uploader': 'dorappi2000',
847 'formats': 'mincount:31',
848 },
849 'skip': 'not actual anymore',
850 },
851 # DASH manifest with segment_list
852 {
853 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8',
854 'md5': '8ce563a1d667b599d21064e982ab9e31',
855 'info_dict': {
856 'id': 'CsmdDsKjzN8',
857 'ext': 'mp4',
858 'upload_date': '20150501', # According to '<meta itemprop="datePublished"', but in other places it's 20150510
859 'uploader': 'Airtek',
860 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
861 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
862 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
863 },
864 'params': {
865 'youtube_include_dash_manifest': True,
866 'format': '135', # bestvideo
867 },
868 'skip': 'This live event has ended.',
869 },
870 {
871 # Multifeed videos (multiple cameras), URL is for Main Camera
872 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
873 'info_dict': {
874 'id': 'jqWvoWXjCVs',
875 'title': 'teamPGP: Rocket League Noob Stream',
876 'description': 'md5:dc7872fb300e143831327f1bae3af010',
877 },
878 'playlist': [{
879 'info_dict': {
880 'id': 'jqWvoWXjCVs',
881 'ext': 'mp4',
882 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
883 'description': 'md5:dc7872fb300e143831327f1bae3af010',
884 'duration': 7335,
885 'upload_date': '20150721',
886 'uploader': 'Beer Games Beer',
887 'uploader_id': 'beergamesbeer',
888 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
889 'license': 'Standard YouTube License',
890 },
891 }, {
892 'info_dict': {
893 'id': '6h8e8xoXJzg',
894 'ext': 'mp4',
895 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
896 'description': 'md5:dc7872fb300e143831327f1bae3af010',
897 'duration': 7337,
898 'upload_date': '20150721',
899 'uploader': 'Beer Games Beer',
900 'uploader_id': 'beergamesbeer',
901 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
902 'license': 'Standard YouTube License',
903 },
904 }, {
905 'info_dict': {
906 'id': 'PUOgX5z9xZw',
907 'ext': 'mp4',
908 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
909 'description': 'md5:dc7872fb300e143831327f1bae3af010',
910 'duration': 7337,
911 'upload_date': '20150721',
912 'uploader': 'Beer Games Beer',
913 'uploader_id': 'beergamesbeer',
914 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
915 'license': 'Standard YouTube License',
916 },
917 }, {
918 'info_dict': {
919 'id': 'teuwxikvS5k',
920 'ext': 'mp4',
921 'title': 'teamPGP: Rocket League Noob Stream (zim)',
922 'description': 'md5:dc7872fb300e143831327f1bae3af010',
923 'duration': 7334,
924 'upload_date': '20150721',
925 'uploader': 'Beer Games Beer',
926 'uploader_id': 'beergamesbeer',
927 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
928 'license': 'Standard YouTube License',
929 },
930 }],
931 'params': {
932 'skip_download': True,
933 },
934 'skip': 'This video is not available.',
935 },
936 {
937 # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
938 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo',
939 'info_dict': {
940 'id': 'gVfLd0zydlo',
941 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
942 },
943 'playlist_count': 2,
944 'skip': 'Not multifeed anymore',
945 },
946 {
947 'url': 'https://vid.plus/FlRa-iH7PGw',
948 'only_matching': True,
949 },
950 {
951 'url': 'https://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
952 'only_matching': True,
953 },
954 {
955 # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468)
956 # Also tests cut-off URL expansion in video description (see
957 # https://github.com/ytdl-org/youtube-dl/issues/1892,
958 # https://github.com/ytdl-org/youtube-dl/issues/8164)
959 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg',
960 'info_dict': {
961 'id': 'lsguqyKfVQg',
962 'ext': 'mp4',
963 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21',
964 'alt_title': 'Dark Walk - Position Music',
965 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
966 'duration': 133,
967 'upload_date': '20151119',
968 'uploader_id': 'IronSoulElf',
969 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
970 'uploader': 'IronSoulElf',
971 'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
972 'track': 'Dark Walk - Position Music',
973 'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
974 'album': 'Position Music - Production Music Vol. 143 - Dark Walk',
975 },
976 'params': {
977 'skip_download': True,
978 },
979 },
980 {
981 # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468)
982 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8',
983 'only_matching': True,
984 },
985 {
986 # Video with yt:stretch=17:0
987 'url': 'https://www.youtube.com/watch?v=Q39EVAstoRM',
988 'info_dict': {
989 'id': 'Q39EVAstoRM',
990 'ext': 'mp4',
991 'title': 'Clash Of Clans#14 Dicas De Ataque Para CV 4',
992 'description': 'md5:ee18a25c350637c8faff806845bddee9',
993 'upload_date': '20151107',
994 'uploader_id': 'UCCr7TALkRbo3EtFzETQF1LA',
995 'uploader': 'CH GAMER DROID',
996 },
997 'params': {
998 'skip_download': True,
999 },
1000 'skip': 'This video does not exist.',
1001 },
1002 {
1003 # Video licensed under Creative Commons
1004 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
1005 'info_dict': {
1006 'id': 'M4gD1WSo5mA',
1007 'ext': 'mp4',
1008 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
1009 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
1010 'duration': 721,
1011 'upload_date': '20150127',
1012 'uploader_id': 'BerkmanCenter',
1013 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
1014 'uploader': 'The Berkman Klein Center for Internet & Society',
1015 'license': 'Creative Commons Attribution license (reuse allowed)',
1016 },
1017 'params': {
1018 'skip_download': True,
1019 },
1020 },
1021 {
1022 # Channel-like uploader_url
1023 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
1024 'info_dict': {
1025 'id': 'eQcmzGIKrzg',
1026 'ext': 'mp4',
1027 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
1028 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
1029 'duration': 4060,
1030 'upload_date': '20151119',
1031 'uploader': 'Bernie Sanders',
1032 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
1033 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
1034 'license': 'Creative Commons Attribution license (reuse allowed)',
1035 },
1036 'params': {
1037 'skip_download': True,
1038 },
1039 },
1040 {
1041 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY',
1042 'only_matching': True,
1043 },
1044 {
1045 # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059)
1046 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
1047 'only_matching': True,
1048 },
1049 {
1050 # Rental video preview
1051 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg',
1052 'info_dict': {
1053 'id': 'uGpuVWrhIzE',
1054 'ext': 'mp4',
1055 'title': 'Piku - Trailer',
1056 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb',
1057 'upload_date': '20150811',
1058 'uploader': 'FlixMatrix',
1059 'uploader_id': 'FlixMatrixKaravan',
1060 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan',
1061 'license': 'Standard YouTube License',
1062 },
1063 'params': {
1064 'skip_download': True,
1065 },
1066 'skip': 'This video is not available.',
1067 },
1068 {
1069 # YouTube Red video with episode data
1070 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4',
1071 'info_dict': {
1072 'id': 'iqKdEhx-dD4',
1073 'ext': 'mp4',
1074 'title': 'Isolation - Mind Field (Ep 1)',
1075 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
1076 'duration': 2085,
1077 'upload_date': '20170118',
1078 'uploader': 'Vsauce',
1079 'uploader_id': 'Vsauce',
1080 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
1081 'series': 'Mind Field',
1082 'season_number': 1,
1083 'episode_number': 1,
1084 },
1085 'params': {
1086 'skip_download': True,
1087 },
1088 'expected_warnings': [
1089 'Skipping DASH manifest',
1090 ],
1091 },
1092 {
1093 # The following content has been identified by the YouTube community
1094 # as inappropriate or offensive to some audiences.
1095 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
1096 'info_dict': {
1097 'id': '6SJNVb0GnPI',
1098 'ext': 'mp4',
1099 'title': 'Race Differences in Intelligence',
1100 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
1101 'duration': 965,
1102 'upload_date': '20140124',
1103 'uploader': 'New Century Foundation',
1104 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
1105 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
1106 },
1107 'params': {
1108 'skip_download': True,
1109 },
1110 },
1111 {
1112 # itag 212
1113 'url': '1t24XAntNCY',
1114 'only_matching': True,
1115 },
1116 {
1117 # geo restricted to JP
1118 'url': 'sJL6WA-aGkQ',
1119 'only_matching': True,
1120 },
1121 {
1122 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
1123 'only_matching': True,
1124 },
1125 {
1126 'url': 'https://invidio.us/watch?v=BaW_jenozKc',
1127 'only_matching': True,
1128 },
1129 {
1130 # DRM protected
1131 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
1132 'only_matching': True,
1133 },
1134 {
1135 # Video with unsupported adaptive stream type formats
1136 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
1137 'info_dict': {
1138 'id': 'Z4Vy8R84T1U',
1139 'ext': 'mp4',
1140 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
1141 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
1142 'duration': 433,
1143 'upload_date': '20130923',
1144 'uploader': 'Amelia Putri Harwita',
1145 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
1146 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
1147 'formats': 'maxcount:10',
1148 },
1149 'params': {
1150 'skip_download': True,
1151 'youtube_include_dash_manifest': False,
1152 },
1153 'skip': 'not actual anymore',
1154 },
1155 {
1156 # Youtube Music Auto-generated description
1157 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
1158 'info_dict': {
1159 'id': 'MgNrAu2pzNs',
1160 'ext': 'mp4',
1161 'title': 'Voyeur Girl',
1162 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
1163 'upload_date': '20190312',
1164 'uploader': 'Stephen - Topic',
1165 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
1166 'artist': 'Stephen',
1167 'track': 'Voyeur Girl',
1168 'album': 'it\'s too much love to know my dear',
1169 'release_date': '20190313',
1170 'release_year': 2019,
1171 },
1172 'params': {
1173 'skip_download': True,
1174 },
1175 },
1176 {
1177 # Youtube Music Auto-generated description
1178 # Retrieve 'artist' field from 'Artist:' in video description
1179 # when it is present on youtube music video
1180 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',
1181 'info_dict': {
1182 'id': 'k0jLE7tTwjY',
1183 'ext': 'mp4',
1184 'title': 'Latch Feat. Sam Smith',
1185 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335',
1186 'upload_date': '20150110',
1187 'uploader': 'Various Artists - Topic',
1188 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w',
1189 'artist': 'Disclosure',
1190 'track': 'Latch Feat. Sam Smith',
1191 'album': 'Latch Featuring Sam Smith',
1192 'release_date': '20121008',
1193 'release_year': 2012,
1194 },
1195 'params': {
1196 'skip_download': True,
1197 },
1198 },
1199 {
1200 # Youtube Music Auto-generated description
1201 # handle multiple artists on youtube music video
1202 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',
1203 'info_dict': {
1204 'id': '74qn0eJSjpA',
1205 'ext': 'mp4',
1206 'title': 'Eastside',
1207 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2',
1208 'upload_date': '20180710',
1209 'uploader': 'Benny Blanco - Topic',
1210 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A',
1211 'artist': 'benny blanco, Halsey, Khalid',
1212 'track': 'Eastside',
1213 'album': 'Eastside',
1214 'release_date': '20180713',
1215 'release_year': 2018,
1216 },
1217 'params': {
1218 'skip_download': True,
1219 },
1220 },
1221 {
1222 # Youtube Music Auto-generated description
1223 # handle youtube music video with release_year and no release_date
1224 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',
1225 'info_dict': {
1226 'id': '-hcAI0g-f5M',
1227 'ext': 'mp4',
1228 'title': 'Put It On Me',
1229 'description': 'md5:f6422397c07c4c907c6638e1fee380a5',
1230 'upload_date': '20180426',
1231 'uploader': 'Matt Maeson - Topic',
1232 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',
1233 'artist': 'Matt Maeson',
1234 'track': 'Put It On Me',
1235 'album': 'The Hearse',
1236 'release_date': None,
1237 'release_year': 2018,
1238 },
1239 'params': {
1240 'skip_download': True,
1241 },
1242 },
1243 {
1244 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
1245 'only_matching': True,
1246 },
1247 {
1248 # invalid -> valid video id redirection
1249 'url': 'DJztXj2GPfl',
1250 'info_dict': {
1251 'id': 'DJztXj2GPfk',
1252 'ext': 'mp4',
1253 'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
1254 'description': 'md5:bf577a41da97918e94fa9798d9228825',
1255 'upload_date': '20090125',
1256 'uploader': 'Prochorowka',
1257 'uploader_id': 'Prochorowka',
1258 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
1259 'artist': 'Panjabi MC',
1260 'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
1261 'album': 'Beware of the Boys (Mundian To Bach Ke)',
1262 },
1263 'params': {
1264 'skip_download': True,
1265 },
1266 },
1267 {
1268 # empty description results in an empty string
1269 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
1270 'info_dict': {
1271 'id': 'x41yOUIvK2k',
1272 'ext': 'mp4',
1273 'title': 'IMG 3456',
1274 'description': '',
1275 'upload_date': '20170613',
1276 'uploader_id': 'ElevageOrVert',
1277 'uploader': 'ElevageOrVert',
1278 },
1279 'params': {
1280 'skip_download': True,
1281 },
1282 },
1283 ]
1284
1285 def __init__(self, *args, **kwargs):
1286 super(YoutubeIE, self).__init__(*args, **kwargs)
1287 self._player_cache = {}
1288
1289 def report_video_info_webpage_download(self, video_id):
1290 """Report attempt to download video info webpage."""
1291 self.to_screen('%s: Downloading video info webpage' % video_id)
1292
1293 def report_information_extraction(self, video_id):
1294 """Report attempt to extract video information."""
1295 self.to_screen('%s: Extracting video information' % video_id)
1296
1297 def report_unavailable_format(self, video_id, format):
1298 """Report extracted video URL."""
1299 self.to_screen('%s: Format %s not available' % (video_id, format))
1300
1301 def report_rtmp_download(self):
1302 """Indicate the download will use the RTMP protocol."""
1303 self.to_screen('RTMP download detected')
1304
1305 def _signature_cache_id(self, example_sig):
1306 """ Return a string representation of a signature """
1307 return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
1308
1309 @classmethod
1310 def _extract_player_info(cls, player_url):
1311 for player_re in cls._PLAYER_INFO_RE:
1312 id_m = re.search(player_re, player_url)
1313 if id_m:
1314 break
1315 else:
1316 raise ExtractorError('Cannot identify player %r' % player_url)
1317 return id_m.group('ext'), id_m.group('id')
1318
    def _extract_signature_function(self, video_id, player_url, example_sig):
        """Return a callable that deciphers an encrypted signature string.

        The result is cached on disk keyed by player type/id and the
        signature cache id (dot-joined part lengths of example_sig), so a
        given player version only has to be downloaded and parsed once per
        signature layout.
        """
        player_type, player_id = self._extract_player_info(player_url)

        # Read from filesystem cache
        func_id = '%s_%s_%s' % (
            player_type, player_id, self._signature_cache_id(example_sig))
        # func_id becomes part of a cache filename; it must not contain
        # path separators.
        assert os.path.basename(func_id) == func_id

        cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
        if cache_spec is not None:
            # The cached spec is a list of character indices: deciphering is
            # reproduced as a pure selection of input characters.
            return lambda s: ''.join(s[i] for i in cache_spec)

        download_note = (
            'Downloading player %s' % player_url
            if self._downloader.params.get('verbose') else
            'Downloading %s player %s' % (player_type, player_id)
        )
        if player_type == 'js':
            code = self._download_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            res = self._parse_sig_js(code)
        elif player_type == 'swf':
            urlh = self._request_webpage(
                player_url, video_id,
                note=download_note,
                errnote='Download of %s failed' % player_url)
            code = urlh.read()
            res = self._parse_sig_swf(code)
        else:
            assert False, 'Invalid player type %r' % player_type

        # Run the extracted function on a probe string whose characters
        # encode their own positions, recovering the index spec to cache.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = res(test_string)
        cache_spec = [ord(c) for c in cache_res]

        self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
        return res
1358
    def _print_sig_code(self, func, example_sig):
        """Print Python source equivalent to the extracted signature function.

        Used with the youtube_print_sig_code option so the deciphering can
        be inspected or hard-coded. Slice expressions are reconstructed from
        the index permutation that func performs on a probe string.
        """
        def gen_sig_code(idxs):
            def _genslice(start, end, step):
                # Render a Python slice expression for the run start..end,
                # omitting parts that match the defaults.
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end + step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            step = None
            # Quelch pyflakes warnings - start will be set when step is set
            start = '(Never used)'
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Currently inside a run of consecutive indices:
                    # extend it, or emit it and reset when the run breaks.
                    if i - prev == step:
                        continue
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Adjacent indices start a new ascending/descending run.
                    step = i - prev
                    start = prev
                    continue
                else:
                    # Isolated index: emit a single-element access.
                    yield 's[%d]' % prev
            # Flush the final element or the final open run.
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Recover the index permutation by applying func to a probe string
        # whose characters encode their own positions.
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
1397
    def _parse_sig_js(self, jscode):
        """Locate the signature-deciphering function in player JS code.

        Returns a callable taking the encrypted signature string and
        returning the deciphered signature.
        """
        # Patterns are tried in order: current player layouts first, then
        # obsolete ones kept for older cached players.
        funcname = self._search_regex(
            (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
             # Obsolete patterns
             r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
             r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
             r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\('),
            jscode, 'Initial JS player signature function name', group='sig')

        # Interpret the JS function and adapt the calling convention
        # (JSInterpreter functions take an argument list).
        jsi = JSInterpreter(jscode)
        initial_function = jsi.extract_function(funcname)
        return lambda s: initial_function([s])
1418
1419 def _parse_sig_swf(self, file_contents):
1420 swfi = SWFInterpreter(file_contents)
1421 TARGET_CLASSNAME = 'SignatureDecipher'
1422 searched_class = swfi.extract_class(TARGET_CLASSNAME)
1423 initial_function = swfi.extract_function(searched_class, 'decipher')
1424 return lambda s: initial_function([s])
1425
1426 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1427 """Turn the encrypted s field into a working signature"""
1428
1429 if player_url is None:
1430 raise ExtractorError('Cannot decrypt signature without player_url')
1431
1432 if player_url.startswith('//'):
1433 player_url = 'https:' + player_url
1434 elif not re.match(r'https?://', player_url):
1435 player_url = compat_urlparse.urljoin(
1436 'https://www.youtube.com', player_url)
1437 try:
1438 player_id = (player_url, self._signature_cache_id(s))
1439 if player_id not in self._player_cache:
1440 func = self._extract_signature_function(
1441 video_id, player_url, s
1442 )
1443 self._player_cache[player_id] = func
1444 func = self._player_cache[player_id]
1445 if self._downloader.params.get('youtube_print_sig_code'):
1446 self._print_sig_code(func, s)
1447 return func(s)
1448 except Exception as e:
1449 tb = traceback.format_exc()
1450 raise ExtractorError(
1451 'Signature extraction failed: ' + tb, cause=e)
1452
1453 def _get_subtitles(self, video_id, webpage):
1454 try:
1455 subs_doc = self._download_xml(
1456 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1457 video_id, note=False)
1458 except ExtractorError as err:
1459 self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
1460 return {}
1461
1462 sub_lang_list = {}
1463 for track in subs_doc.findall('track'):
1464 lang = track.attrib['lang_code']
1465 if lang in sub_lang_list:
1466 continue
1467 sub_formats = []
1468 for ext in self._SUBTITLE_FORMATS:
1469 params = compat_urllib_parse_urlencode({
1470 'lang': lang,
1471 'v': video_id,
1472 'fmt': ext,
1473 'name': track.attrib['name'].encode('utf-8'),
1474 })
1475 sub_formats.append({
1476 'url': 'https://www.youtube.com/api/timedtext?' + params,
1477 'ext': ext,
1478 })
1479 sub_lang_list[lang] = sub_formats
1480 if not sub_lang_list:
1481 self._downloader.report_warning('video doesn\'t have subtitles')
1482 return {}
1483 return sub_lang_list
1484
1485 def _get_ytplayer_config(self, video_id, webpage):
1486 patterns = (
1487 # User data may contain arbitrary character sequences that may affect
1488 # JSON extraction with regex, e.g. when '};' is contained the second
1489 # regex won't capture the whole JSON. Yet working around by trying more
1490 # concrete regex first keeping in mind proper quoted string handling
1491 # to be implemented in future that will replace this workaround (see
1492 # https://github.com/ytdl-org/youtube-dl/issues/7468,
1493 # https://github.com/ytdl-org/youtube-dl/pull/7599)
1494 r';ytplayer\.config\s*=\s*({.+?});ytplayer',
1495 r';ytplayer\.config\s*=\s*({.+?});',
1496 )
1497 config = self._search_regex(
1498 patterns, webpage, 'ytplayer.config', default=None)
1499 if config:
1500 return self._parse_json(
1501 uppercase_escape(config), video_id, fatal=False)
1502
    def _get_automatic_captions(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns a dict mapping language code -> list of caption formats, or
        {} (with a warning) when no automatic captions can be found.
        """
        self.to_screen('%s: Looking for automatic captions' % video_id)
        player_config = self._get_ytplayer_config(video_id, webpage)
        err_msg = 'Couldn\'t find automatic captions for %s' % video_id
        if not player_config:
            self._downloader.report_warning(err_msg)
            return {}
        try:
            args = player_config['args']
            # Oldest layout: explicit ttsurl + timestamp in player args.
            caption_url = args.get('ttsurl')
            if caption_url:
                timestamp = args['timestamp']
                # We get the available subtitles
                list_params = compat_urllib_parse_urlencode({
                    'type': 'list',
                    'tlangs': 1,
                    'asrs': 1,
                })
                list_url = caption_url + '&' + list_params
                caption_list = self._download_xml(list_url, video_id)
                original_lang_node = caption_list.find('track')
                if original_lang_node is None:
                    self._downloader.report_warning('Video doesn\'t have automatic captions')
                    return {}
                original_lang = original_lang_node.attrib['lang_code']
                caption_kind = original_lang_node.attrib.get('kind', '')

                sub_lang_list = {}
                for lang_node in caption_list.findall('target'):
                    sub_lang = lang_node.attrib['lang_code']
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        params = compat_urllib_parse_urlencode({
                            'lang': original_lang,
                            'tlang': sub_lang,
                            'fmt': ext,
                            'ts': timestamp,
                            'kind': caption_kind,
                        })
                        sub_formats.append({
                            'url': caption_url + '&' + params,
                            'ext': ext,
                        })
                    sub_lang_list[sub_lang] = sub_formats
                return sub_lang_list

            def make_captions(sub_url, sub_langs):
                # Build the captions dict by rewriting sub_url's query string
                # for every (target language, subtitle format) combination.
                parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
                caption_qs = compat_parse_qs(parsed_sub_url.query)
                captions = {}
                for sub_lang in sub_langs:
                    sub_formats = []
                    for ext in self._SUBTITLE_FORMATS:
                        caption_qs.update({
                            'tlang': [sub_lang],
                            'fmt': [ext],
                        })
                        sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
                            query=compat_urllib_parse_urlencode(caption_qs, True)))
                        sub_formats.append({
                            'url': sub_url,
                            'ext': ext,
                        })
                    captions[sub_lang] = sub_formats
                return captions

            # New captions format as of 22.06.2017
            player_response = args.get('player_response')
            if player_response and isinstance(player_response, compat_str):
                player_response = self._parse_json(
                    player_response, video_id, fatal=False)
                if player_response:
                    renderer = player_response['captions']['playerCaptionsTracklistRenderer']
                    base_url = renderer['captionTracks'][0]['baseUrl']
                    sub_lang_list = []
                    for lang in renderer['translationLanguages']:
                        lang_code = lang.get('languageCode')
                        if lang_code:
                            sub_lang_list.append(lang_code)
                    return make_captions(base_url, sub_lang_list)

            # Some videos don't provide ttsurl but rather caption_tracks and
            # caption_translation_languages (e.g. 20LmZk1hakA)
            # Not used anymore as of 22.06.2017
            caption_tracks = args['caption_tracks']
            caption_translation_languages = args['caption_translation_languages']
            caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
            sub_lang_list = []
            for lang in caption_translation_languages.split(','):
                lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
                sub_lang = lang_qs.get('lc', [None])[0]
                if sub_lang:
                    sub_lang_list.append(sub_lang)
            return make_captions(caption_url, sub_lang_list)
        # An extractor error can be raised by the download process if there are
        # no automatic captions but there are subtitles
        except (KeyError, IndexError, ExtractorError):
            self._downloader.report_warning(err_msg)
            return {}
1604
1605 def _mark_watched(self, video_id, video_info, player_response):
1606 playback_url = url_or_none(try_get(
1607 player_response,
1608 lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
1609 video_info, lambda x: x['videostats_playback_base_url'][0]))
1610 if not playback_url:
1611 return
1612 parsed_playback_url = compat_urlparse.urlparse(playback_url)
1613 qs = compat_urlparse.parse_qs(parsed_playback_url.query)
1614
1615 # cpn generation algorithm is reverse engineered from base.js.
1616 # In fact it works even with dummy cpn.
1617 CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
1618 cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
1619
1620 qs.update({
1621 'ver': ['2'],
1622 'cpn': [cpn],
1623 })
1624 playback_url = compat_urlparse.urlunparse(
1625 parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
1626
1627 self._download_webpage(
1628 playback_url, video_id, 'Marking watched',
1629 'Unable to mark watched', fatal=False)
1630
1631 @staticmethod
1632 def _extract_urls(webpage):
1633 # Embedded YouTube player
1634 entries = [
1635 unescapeHTML(mobj.group('url'))
1636 for mobj in re.finditer(r'''(?x)
1637 (?:
1638 <iframe[^>]+?src=|
1639 data-video-url=|
1640 <embed[^>]+?src=|
1641 embedSWF\(?:\s*|
1642 <object[^>]+data=|
1643 new\s+SWFObject\(
1644 )
1645 (["\'])
1646 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1647 (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?)
1648 \1''', webpage)]
1649
1650 # lazyYT YouTube embed
1651 entries.extend(list(map(
1652 unescapeHTML,
1653 re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
1654
1655 # Wordpress "YouTube Video Importer" plugin
1656 matches = re.findall(r'''(?x)<div[^>]+
1657 class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
1658 data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
1659 entries.extend(m[-1] for m in matches)
1660
1661 return entries
1662
1663 @staticmethod
1664 def _extract_url(webpage):
1665 urls = YoutubeIE._extract_urls(webpage)
1666 return urls[0] if urls else None
1667
1668 @classmethod
1669 def extract_id(cls, url):
1670 mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
1671 if mobj is None:
1672 raise ExtractorError('Invalid URL: %s' % url)
1673 video_id = mobj.group(2)
1674 return video_id
1675
    def _extract_chapters_from_json(self, webpage, video_id, duration):
        """Extract chapter markers from RELATED_PLAYER_ARGS JSON in webpage.

        Returns a list of {'start_time', 'end_time', 'title'} dicts, or
        None when no chapter data can be located.
        """
        if not webpage:
            return
        player = self._parse_json(
            self._search_regex(
                r'RELATED_PLAYER_ARGS["\']\s*:\s*({.+})\s*,?\s*\n', webpage,
                'player args', default='{}'),
            video_id, fatal=False)
        if not player or not isinstance(player, dict):
            return
        # watch_next_response is itself JSON embedded as a string.
        watch_next_response = player.get('watch_next_response')
        if not isinstance(watch_next_response, compat_str):
            return
        response = self._parse_json(watch_next_response, video_id, fatal=False)
        if not response or not isinstance(response, dict):
            return
        chapters_list = try_get(
            response,
            lambda x: x['playerOverlays']
                       ['playerOverlayRenderer']
                       ['decoratedPlayerBarRenderer']
                       ['decoratedPlayerBarRenderer']
                       ['playerBar']
                       ['chapteredPlayerBarRenderer']
                       ['chapters'],
            list)
        if not chapters_list:
            return

        def chapter_time(chapter):
            # Chapter start is given in milliseconds; convert to seconds.
            return float_or_none(
                try_get(
                    chapter,
                    lambda x: x['chapterRenderer']['timeRangeStartMillis'],
                    int),
                scale=1000)
        chapters = []
        for next_num, chapter in enumerate(chapters_list, start=1):
            start_time = chapter_time(chapter)
            if start_time is None:
                continue
            # A chapter ends where the next one starts; the last chapter
            # ends at the video duration.
            end_time = (chapter_time(chapters_list[next_num])
                        if next_num < len(chapters_list) else duration)
            if end_time is None:
                continue
            title = try_get(
                chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
                compat_str)
            chapters.append({
                'start_time': start_time,
                'end_time': end_time,
                'title': title,
            })
        return chapters
1730
1731 @staticmethod
1732 def _extract_chapters_from_description(description, duration):
1733 if not description:
1734 return None
1735 chapter_lines = re.findall(
1736 r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
1737 description)
1738 if not chapter_lines:
1739 return None
1740 chapters = []
1741 for next_num, (chapter_line, time_point) in enumerate(
1742 chapter_lines, start=1):
1743 start_time = parse_duration(time_point)
1744 if start_time is None:
1745 continue
1746 if start_time > duration:
1747 break
1748 end_time = (duration if next_num == len(chapter_lines)
1749 else parse_duration(chapter_lines[next_num][1]))
1750 if end_time is None:
1751 continue
1752 if end_time > duration:
1753 end_time = duration
1754 if start_time > end_time:
1755 break
1756 chapter_title = re.sub(
1757 r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
1758 chapter_title = re.sub(r'\s+', ' ', chapter_title)
1759 chapters.append({
1760 'start_time': start_time,
1761 'end_time': end_time,
1762 'title': chapter_title,
1763 })
1764 return chapters
1765
1766 def _extract_chapters(self, webpage, description, video_id, duration):
1767 return (self._extract_chapters_from_json(webpage, video_id, duration)
1768 or self._extract_chapters_from_description(description, duration))
1769
1770 def _real_extract(self, url):
1771 url, smuggled_data = unsmuggle_url(url, {})
1772
1773 proto = (
1774 'http' if self._downloader.params.get('prefer_insecure', False)
1775 else 'https')
1776
1777 start_time = None
1778 end_time = None
1779 parsed_url = compat_urllib_parse_urlparse(url)
1780 for component in [parsed_url.fragment, parsed_url.query]:
1781 query = compat_parse_qs(component)
1782 if start_time is None and 't' in query:
1783 start_time = parse_duration(query['t'][0])
1784 if start_time is None and 'start' in query:
1785 start_time = parse_duration(query['start'][0])
1786 if end_time is None and 'end' in query:
1787 end_time = parse_duration(query['end'][0])
1788
1789 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1790 mobj = re.search(self._NEXT_URL_RE, url)
1791 if mobj:
1792 url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
1793 video_id = self.extract_id(url)
1794
1795 # Get video webpage
1796 url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
1797 video_webpage, urlh = self._download_webpage_handle(url, video_id)
1798
1799 qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
1800 video_id = qs.get('v', [None])[0] or video_id
1801
1802 # Attempt to extract SWF player URL
1803 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1804 if mobj is not None:
1805 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1806 else:
1807 player_url = None
1808
1809 dash_mpds = []
1810
1811 def add_dash_mpd(video_info):
1812 dash_mpd = video_info.get('dashmpd')
1813 if dash_mpd and dash_mpd[0] not in dash_mpds:
1814 dash_mpds.append(dash_mpd[0])
1815
1816 def add_dash_mpd_pr(pl_response):
1817 dash_mpd = url_or_none(try_get(
1818 pl_response, lambda x: x['streamingData']['dashManifestUrl'],
1819 compat_str))
1820 if dash_mpd and dash_mpd not in dash_mpds:
1821 dash_mpds.append(dash_mpd)
1822
1823 is_live = None
1824 view_count = None
1825
1826 def extract_view_count(v_info):
1827 return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
1828
1829 def extract_player_response(player_response, video_id):
1830 pl_response = str_or_none(player_response)
1831 if not pl_response:
1832 return
1833 pl_response = self._parse_json(pl_response, video_id, fatal=False)
1834 if isinstance(pl_response, dict):
1835 add_dash_mpd_pr(pl_response)
1836 return pl_response
1837
1838 player_response = {}
1839
1840 # Get video info
1841 video_info = {}
1842 embed_webpage = None
1843 if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
1844 or re.search(r'player-age-gate-content">', video_webpage) is not None):
1845 age_gate = True
1846 # We simulate the access to the video from www.youtube.com/v/{video_id}
1847 # this can be viewed without login into Youtube
1848 url = proto + '://www.youtube.com/embed/%s' % video_id
1849 embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
1850 data = compat_urllib_parse_urlencode({
1851 'video_id': video_id,
1852 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1853 'sts': self._search_regex(
1854 r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
1855 })
1856 video_info_url = proto + '://www.youtube.com/get_video_info?' + data
1857 try:
1858 video_info_webpage = self._download_webpage(
1859 video_info_url, video_id,
1860 note='Refetching age-gated info webpage',
1861 errnote='unable to download video info webpage')
1862 except ExtractorError:
1863 video_info_webpage = None
1864 if video_info_webpage:
1865 video_info = compat_parse_qs(video_info_webpage)
1866 pl_response = video_info.get('player_response', [None])[0]
1867 player_response = extract_player_response(pl_response, video_id)
1868 add_dash_mpd(video_info)
1869 view_count = extract_view_count(video_info)
1870 else:
1871 age_gate = False
1872 # Try looking directly into the video webpage
1873 ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
1874 if ytplayer_config:
1875 args = ytplayer_config['args']
1876 if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
1877 # Convert to the same format returned by compat_parse_qs
1878 video_info = dict((k, [v]) for k, v in args.items())
1879 add_dash_mpd(video_info)
1880 # Rental video is not rented but preview is available (e.g.
1881 # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
1882 # https://github.com/ytdl-org/youtube-dl/issues/10532)
1883 if not video_info and args.get('ypc_vid'):
1884 return self.url_result(
1885 args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
1886 if args.get('livestream') == '1' or args.get('live_playback') == 1:
1887 is_live = True
1888 if not player_response:
1889 player_response = extract_player_response(args.get('player_response'), video_id)
1890 if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
1891 add_dash_mpd_pr(player_response)
1892
1893 def extract_unavailable_message():
1894 messages = []
1895 for tag, kind in (('h1', 'message'), ('div', 'submessage')):
1896 msg = self._html_search_regex(
1897 r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
1898 video_webpage, 'unavailable %s' % kind, default=None)
1899 if msg:
1900 messages.append(msg)
1901 if messages:
1902 return '\n'.join(messages)
1903
1904 if not video_info and not player_response:
1905 unavailable_message = extract_unavailable_message()
1906 if not unavailable_message:
1907 unavailable_message = 'Unable to extract video data'
1908 raise ExtractorError(
1909 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
1910
1911 if not isinstance(video_info, dict):
1912 video_info = {}
1913
1914 video_details = try_get(
1915 player_response, lambda x: x['videoDetails'], dict) or {}
1916
1917 microformat = try_get(
1918 player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
1919
1920 video_title = video_info.get('title', [None])[0] or video_details.get('title')
1921 if not video_title:
1922 self._downloader.report_warning('Unable to extract video title')
1923 video_title = '_'
1924
1925 description_original = video_description = get_element_by_id("eow-description", video_webpage)
1926 if video_description:
1927
1928 def replace_url(m):
1929 redir_url = compat_urlparse.urljoin(url, m.group(1))
1930 parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
1931 if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
1932 qs = compat_parse_qs(parsed_redir_url.query)
1933 q = qs.get('q')
1934 if q and q[0]:
1935 return q[0]
1936 return redir_url
1937
1938 description_original = video_description = re.sub(r'''(?x)
1939 <a\s+
1940 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1941 (?:title|href)="([^"]+)"\s+
1942 (?:[a-zA-Z-]+="[^"]*"\s+)*?
1943 class="[^"]*"[^>]*>
1944 [^<]+\.{3}\s*
1945 </a>
1946 ''', replace_url, video_description)
1947 video_description = clean_html(video_description)
1948 else:
1949 video_description = video_details.get('shortDescription')
1950 if video_description is None:
1951 video_description = self._html_search_meta('description', video_webpage)
1952
1953 if not smuggled_data.get('force_singlefeed', False):
1954 if not self._downloader.params.get('noplaylist'):
1955 multifeed_metadata_list = try_get(
1956 player_response,
1957 lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
1958 compat_str) or try_get(
1959 video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
1960 if multifeed_metadata_list:
1961 entries = []
1962 feed_ids = []
1963 for feed in multifeed_metadata_list.split(','):
1964 # Unquote should take place before split on comma (,) since textual
1965 # fields may contain comma as well (see
1966 # https://github.com/ytdl-org/youtube-dl/issues/8536)
1967 feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
1968
1969 def feed_entry(name):
1970 return try_get(feed_data, lambda x: x[name][0], compat_str)
1971
1972 feed_id = feed_entry('id')
1973 if not feed_id:
1974 continue
1975 feed_title = feed_entry('title')
1976 title = video_title
1977 if feed_title:
1978 title += ' (%s)' % feed_title
1979 entries.append({
1980 '_type': 'url_transparent',
1981 'ie_key': 'Youtube',
1982 'url': smuggle_url(
1983 '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
1984 {'force_singlefeed': True}),
1985 'title': title,
1986 })
1987 feed_ids.append(feed_id)
1988 self.to_screen(
1989 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
1990 % (', '.join(feed_ids), video_id))
1991 return self.playlist_result(entries, video_id, video_title, video_description)
1992 else:
1993 self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
1994
1995 if view_count is None:
1996 view_count = extract_view_count(video_info)
1997 if view_count is None and video_details:
1998 view_count = int_or_none(video_details.get('viewCount'))
1999 if view_count is None and microformat:
2000 view_count = int_or_none(microformat.get('viewCount'))
2001
2002 if is_live is None:
2003 is_live = bool_or_none(video_details.get('isLive'))
2004
2005 # Check for "rental" videos
2006 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
2007 raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
2008
2009 def _extract_filesize(media_url):
2010 return int_or_none(self._search_regex(
2011 r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
2012
2013 streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
2014 streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
2015
2016 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
2017 self.report_rtmp_download()
2018 formats = [{
2019 'format_id': '_rtmp',
2020 'protocol': 'rtmp',
2021 'url': video_info['conn'][0],
2022 'player_url': player_url,
2023 }]
2024 elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
2025 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
2026 if 'rtmpe%3Dyes' in encoded_url_map:
2027 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
2028 formats = []
2029 formats_spec = {}
2030 fmt_list = video_info.get('fmt_list', [''])[0]
2031 if fmt_list:
2032 for fmt in fmt_list.split(','):
2033 spec = fmt.split('/')
2034 if len(spec) > 1:
2035 width_height = spec[1].split('x')
2036 if len(width_height) == 2:
2037 formats_spec[spec[0]] = {
2038 'resolution': spec[1],
2039 'width': int_or_none(width_height[0]),
2040 'height': int_or_none(width_height[1]),
2041 }
2042 for fmt in streaming_formats:
2043 itag = str_or_none(fmt.get('itag'))
2044 if not itag:
2045 continue
2046 quality = fmt.get('quality')
2047 quality_label = fmt.get('qualityLabel') or quality
2048 formats_spec[itag] = {
2049 'asr': int_or_none(fmt.get('audioSampleRate')),
2050 'filesize': int_or_none(fmt.get('contentLength')),
2051 'format_note': quality_label,
2052 'fps': int_or_none(fmt.get('fps')),
2053 'height': int_or_none(fmt.get('height')),
2054 # bitrate for itag 43 is always 2147483647
2055 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
2056 'width': int_or_none(fmt.get('width')),
2057 }
2058
2059 for fmt in streaming_formats:
2060 if fmt.get('drmFamilies') or fmt.get('drm_families'):
2061 continue
2062 url = url_or_none(fmt.get('url'))
2063
2064 if not url:
2065 cipher = fmt.get('cipher') or fmt.get('signatureCipher')
2066 if not cipher:
2067 continue
2068 url_data = compat_parse_qs(cipher)
2069 url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
2070 if not url:
2071 continue
2072 else:
2073 cipher = None
2074 url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2075
2076 stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
2077 # Unsupported FORMAT_STREAM_TYPE_OTF
2078 if stream_type == 3:
2079 continue
2080
2081 format_id = fmt.get('itag') or url_data['itag'][0]
2082 if not format_id:
2083 continue
2084 format_id = compat_str(format_id)
2085
2086 if cipher:
2087 if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
2088 ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
2089 jsplayer_url_json = self._search_regex(
2090 ASSETS_RE,
2091 embed_webpage if age_gate else video_webpage,
2092 'JS player URL (1)', default=None)
2093 if not jsplayer_url_json and not age_gate:
2094 # We need the embed website after all
2095 if embed_webpage is None:
2096 embed_url = proto + '://www.youtube.com/embed/%s' % video_id
2097 embed_webpage = self._download_webpage(
2098 embed_url, video_id, 'Downloading embed webpage')
2099 jsplayer_url_json = self._search_regex(
2100 ASSETS_RE, embed_webpage, 'JS player URL')
2101
2102 player_url = json.loads(jsplayer_url_json)
2103 if player_url is None:
2104 player_url_json = self._search_regex(
2105 r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
2106 video_webpage, 'age gate player URL')
2107 player_url = json.loads(player_url_json)
2108
2109 if 'sig' in url_data:
2110 url += '&signature=' + url_data['sig'][0]
2111 elif 's' in url_data:
2112 encrypted_sig = url_data['s'][0]
2113
2114 if self._downloader.params.get('verbose'):
2115 if player_url is None:
2116 player_desc = 'unknown'
2117 else:
2118 player_type, player_version = self._extract_player_info(player_url)
2119 player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
2120 parts_sizes = self._signature_cache_id(encrypted_sig)
2121 self.to_screen('{%s} signature length %s, %s' %
2122 (format_id, parts_sizes, player_desc))
2123
2124 signature = self._decrypt_signature(
2125 encrypted_sig, video_id, player_url, age_gate)
2126 sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
2127 url += '&%s=%s' % (sp, signature)
2128 if 'ratebypass' not in url:
2129 url += '&ratebypass=yes'
2130
2131 dct = {
2132 'format_id': format_id,
2133 'url': url,
2134 'player_url': player_url,
2135 }
2136 if format_id in self._formats:
2137 dct.update(self._formats[format_id])
2138 if format_id in formats_spec:
2139 dct.update(formats_spec[format_id])
2140
2141 # Some itags are not included in DASH manifest thus corresponding formats will
2142 # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
2143 # Trying to extract metadata from url_encoded_fmt_stream_map entry.
2144 mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
2145 width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
2146
2147 if width is None:
2148 width = int_or_none(fmt.get('width'))
2149 if height is None:
2150 height = int_or_none(fmt.get('height'))
2151
2152 filesize = int_or_none(url_data.get(
2153 'clen', [None])[0]) or _extract_filesize(url)
2154
2155 quality = url_data.get('quality', [None])[0] or fmt.get('quality')
2156 quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
2157
2158 tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
2159 or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
2160 fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
2161
2162 more_fields = {
2163 'filesize': filesize,
2164 'tbr': tbr,
2165 'width': width,
2166 'height': height,
2167 'fps': fps,
2168 'format_note': quality_label or quality,
2169 }
2170 for key, value in more_fields.items():
2171 if value:
2172 dct[key] = value
2173 type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
2174 if type_:
2175 type_split = type_.split(';')
2176 kind_ext = type_split[0].split('/')
2177 if len(kind_ext) == 2:
2178 kind, _ = kind_ext
2179 dct['ext'] = mimetype2ext(type_split[0])
2180 if kind in ('audio', 'video'):
2181 codecs = None
2182 for mobj in re.finditer(
2183 r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
2184 if mobj.group('key') == 'codecs':
2185 codecs = mobj.group('val')
2186 break
2187 if codecs:
2188 dct.update(parse_codecs(codecs))
2189 if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
2190 dct['downloader_options'] = {
2191 # Youtube throttles chunks >~10M
2192 'http_chunk_size': 10485760,
2193 }
2194 formats.append(dct)
2195 else:
2196 manifest_url = (
2197 url_or_none(try_get(
2198 player_response,
2199 lambda x: x['streamingData']['hlsManifestUrl'],
2200 compat_str))
2201 or url_or_none(try_get(
2202 video_info, lambda x: x['hlsvp'][0], compat_str)))
2203 if manifest_url:
2204 formats = []
2205 m3u8_formats = self._extract_m3u8_formats(
2206 manifest_url, video_id, 'mp4', fatal=False)
2207 for a_format in m3u8_formats:
2208 itag = self._search_regex(
2209 r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
2210 if itag:
2211 a_format['format_id'] = itag
2212 if itag in self._formats:
2213 dct = self._formats[itag].copy()
2214 dct.update(a_format)
2215 a_format = dct
2216 a_format['player_url'] = player_url
2217 # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
2218 a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
2219 formats.append(a_format)
2220 else:
2221 error_message = extract_unavailable_message()
2222 if not error_message:
2223 error_message = clean_html(try_get(
2224 player_response, lambda x: x['playabilityStatus']['reason'],
2225 compat_str))
2226 if not error_message:
2227 error_message = clean_html(
2228 try_get(video_info, lambda x: x['reason'][0], compat_str))
2229 if error_message:
2230 raise ExtractorError(error_message, expected=True)
2231 raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
2232
2233 # uploader
2234 video_uploader = try_get(
2235 video_info, lambda x: x['author'][0],
2236 compat_str) or str_or_none(video_details.get('author'))
2237 if video_uploader:
2238 video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
2239 else:
2240 self._downloader.report_warning('unable to extract uploader name')
2241
2242 # uploader_id
2243 video_uploader_id = None
2244 video_uploader_url = None
2245 mobj = re.search(
2246 r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
2247 video_webpage)
2248 if mobj is not None:
2249 video_uploader_id = mobj.group('uploader_id')
2250 video_uploader_url = mobj.group('uploader_url')
2251 else:
2252 owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
2253 if owner_profile_url:
2254 video_uploader_id = self._search_regex(
2255 r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
2256 default=None)
2257 video_uploader_url = owner_profile_url
2258
2259 channel_id = (
2260 str_or_none(video_details.get('channelId'))
2261 or self._html_search_meta(
2262 'channelId', video_webpage, 'channel id', default=None)
2263 or self._search_regex(
2264 r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
2265 video_webpage, 'channel id', default=None, group='id'))
2266 channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
2267
2268 thumbnails = []
2269 thumbnails_list = try_get(
2270 video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
2271 for t in thumbnails_list:
2272 if not isinstance(t, dict):
2273 continue
2274 thumbnail_url = url_or_none(t.get('url'))
2275 if not thumbnail_url:
2276 continue
2277 thumbnails.append({
2278 'url': thumbnail_url,
2279 'width': int_or_none(t.get('width')),
2280 'height': int_or_none(t.get('height')),
2281 })
2282
2283 if not thumbnails:
2284 video_thumbnail = None
2285 # We try first to get a high quality image:
2286 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
2287 video_webpage, re.DOTALL)
2288 if m_thumb is not None:
2289 video_thumbnail = m_thumb.group(1)
2290 thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
2291 if thumbnail_url:
2292 video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
2293 if video_thumbnail:
2294 thumbnails.append({'url': video_thumbnail})
2295
2296 # upload date
2297 upload_date = self._html_search_meta(
2298 'datePublished', video_webpage, 'upload date', default=None)
2299 if not upload_date:
2300 upload_date = self._search_regex(
2301 [r'(?s)id="eow-date.*?>(.*?)</span>',
2302 r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
2303 video_webpage, 'upload date', default=None)
2304 if not upload_date:
2305 upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
2306 upload_date = unified_strdate(upload_date)
2307
2308 video_license = self._html_search_regex(
2309 r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
2310 video_webpage, 'license', default=None)
2311
2312 m_music = re.search(
2313 r'''(?x)
2314 <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
2315 <ul[^>]*>\s*
2316 <li>(?P<title>.+?)
2317 by (?P<creator>.+?)
2318 (?:
2319 \(.+?\)|
2320 <a[^>]*
2321 (?:
2322 \bhref=["\']/red[^>]*>| # drop possible
2323 >\s*Listen ad-free with YouTube Red # YouTube Red ad
2324 )
2325 .*?
2326 )?</li
2327 ''',
2328 video_webpage)
2329 if m_music:
2330 video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
2331 video_creator = clean_html(m_music.group('creator'))
2332 else:
2333 video_alt_title = video_creator = None
2334
2335 def extract_meta(field):
2336 return self._html_search_regex(
2337 r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
2338 video_webpage, field, default=None)
2339
2340 track = extract_meta('Song')
2341 artist = extract_meta('Artist')
2342 album = extract_meta('Album')
2343
2344 # Youtube Music Auto-generated description
2345 release_date = release_year = None
2346 if video_description:
2347 mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
2348 if mobj:
2349 if not track:
2350 track = mobj.group('track').strip()
2351 if not artist:
2352 artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
2353 if not album:
2354 album = mobj.group('album'.strip())
2355 release_year = mobj.group('release_year')
2356 release_date = mobj.group('release_date')
2357 if release_date:
2358 release_date = release_date.replace('-', '')
2359 if not release_year:
2360 release_year = int(release_date[:4])
2361 if release_year:
2362 release_year = int(release_year)
2363
2364 m_episode = re.search(
2365 r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
2366 video_webpage)
2367 if m_episode:
2368 series = unescapeHTML(m_episode.group('series'))
2369 season_number = int(m_episode.group('season'))
2370 episode_number = int(m_episode.group('episode'))
2371 else:
2372 series = season_number = episode_number = None
2373
2374 m_cat_container = self._search_regex(
2375 r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
2376 video_webpage, 'categories', default=None)
2377 category = None
2378 if m_cat_container:
2379 category = self._html_search_regex(
2380 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
2381 default=None)
2382 if not category:
2383 category = try_get(
2384 microformat, lambda x: x['category'], compat_str)
2385 video_categories = None if category is None else [category]
2386
2387 video_tags = [
2388 unescapeHTML(m.group('content'))
2389 for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
2390 if not video_tags:
2391 video_tags = try_get(video_details, lambda x: x['keywords'], list)
2392
2393 def _extract_count(count_name):
2394 return str_to_int(self._search_regex(
2395 r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
2396 % re.escape(count_name),
2397 video_webpage, count_name, default=None))
2398
2399 like_count = _extract_count('like')
2400 dislike_count = _extract_count('dislike')
2401
2402 if view_count is None:
2403 view_count = str_to_int(self._search_regex(
2404 r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
2405 'view count', default=None))
2406
2407 average_rating = (
2408 float_or_none(video_details.get('averageRating'))
2409 or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
2410
2411 # subtitles
2412 video_subtitles = self.extract_subtitles(video_id, video_webpage)
2413 automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
2414
2415 video_duration = try_get(
2416 video_info, lambda x: int_or_none(x['length_seconds'][0]))
2417 if not video_duration:
2418 video_duration = int_or_none(video_details.get('lengthSeconds'))
2419 if not video_duration:
2420 video_duration = parse_duration(self._html_search_meta(
2421 'duration', video_webpage, 'video duration'))
2422
2423 # annotations
2424 video_annotations = None
2425 if self._downloader.params.get('writeannotations', False):
2426 xsrf_token = self._search_regex(
2427 r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
2428 video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
2429 invideo_url = try_get(
2430 player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
2431 if xsrf_token and invideo_url:
2432 xsrf_field_name = self._search_regex(
2433 r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
2434 video_webpage, 'xsrf field name',
2435 group='xsrf_field_name', default='session_token')
2436 video_annotations = self._download_webpage(
2437 self._proto_relative_url(invideo_url),
2438 video_id, note='Downloading annotations',
2439 errnote='Unable to download video annotations', fatal=False,
2440 data=urlencode_postdata({xsrf_field_name: xsrf_token}))
2441
2442 chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
2443
2444 # Look for the DASH manifest
2445 if self._downloader.params.get('youtube_include_dash_manifest', True):
2446 dash_mpd_fatal = True
2447 for mpd_url in dash_mpds:
2448 dash_formats = {}
2449 try:
2450 def decrypt_sig(mobj):
2451 s = mobj.group(1)
2452 dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
2453 return '/signature/%s' % dec_s
2454
2455 mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
2456
2457 for df in self._extract_mpd_formats(
2458 mpd_url, video_id, fatal=dash_mpd_fatal,
2459 formats_dict=self._formats):
2460 if not df.get('filesize'):
2461 df['filesize'] = _extract_filesize(df['url'])
2462 # Do not overwrite DASH format found in some previous DASH manifest
2463 if df['format_id'] not in dash_formats:
2464 dash_formats[df['format_id']] = df
2465 # Additional DASH manifests may end up in HTTP Error 403 therefore
2466 # allow them to fail without bug report message if we already have
2467 # some DASH manifest succeeded. This is temporary workaround to reduce
2468 # burst of bug reports until we figure out the reason and whether it
2469 # can be fixed at all.
2470 dash_mpd_fatal = False
2471 except (ExtractorError, KeyError) as e:
2472 self.report_warning(
2473 'Skipping DASH manifest: %r' % e, video_id)
2474 if dash_formats:
2475 # Remove the formats we found through non-DASH, they
2476 # contain less info and it can be wrong, because we use
2477 # fixed values (for example the resolution). See
2478 # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
2479 # example.
2480 formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
2481 formats.extend(dash_formats.values())
2482
2483 # Check for malformed aspect ratio
2484 stretched_m = re.search(
2485 r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
2486 video_webpage)
2487 if stretched_m:
2488 w = float(stretched_m.group('w'))
2489 h = float(stretched_m.group('h'))
2490 # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
2491 # We will only process correct ratios.
2492 if w > 0 and h > 0:
2493 ratio = w / h
2494 for f in formats:
2495 if f.get('vcodec') != 'none':
2496 f['stretched_ratio'] = ratio
2497
2498 if not formats:
2499 if 'reason' in video_info:
2500 if 'The uploader has not made this video available in your country.' in video_info['reason']:
2501 regions_allowed = self._html_search_meta(
2502 'regionsAllowed', video_webpage, default=None)
2503 countries = regions_allowed.split(',') if regions_allowed else None
2504 self.raise_geo_restricted(
2505 msg=video_info['reason'][0], countries=countries)
2506 reason = video_info['reason'][0]
2507 if 'Invalid parameters' in reason:
2508 unavailable_message = extract_unavailable_message()
2509 if unavailable_message:
2510 reason = unavailable_message
2511 raise ExtractorError(
2512 'YouTube said: %s' % reason,
2513 expected=True, video_id=video_id)
2514 if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
2515 raise ExtractorError('This video is DRM protected.', expected=True)
2516
2517 self._sort_formats(formats)
2518
2519 self.mark_watched(video_id, video_info, player_response)
2520
2521 return {
2522 'id': video_id,
2523 'uploader': video_uploader,
2524 'uploader_id': video_uploader_id,
2525 'uploader_url': video_uploader_url,
2526 'channel_id': channel_id,
2527 'channel_url': channel_url,
2528 'upload_date': upload_date,
2529 'license': video_license,
2530 'creator': video_creator or artist,
2531 'title': video_title,
2532 'alt_title': video_alt_title or track,
2533 'thumbnails': thumbnails,
2534 'description': video_description,
2535 'categories': video_categories,
2536 'tags': video_tags,
2537 'subtitles': video_subtitles,
2538 'automatic_captions': automatic_captions,
2539 'duration': video_duration,
2540 'age_limit': 18 if age_gate else 0,
2541 'annotations': video_annotations,
2542 'chapters': chapters,
2543 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
2544 'view_count': view_count,
2545 'like_count': like_count,
2546 'dislike_count': dislike_count,
2547 'average_rating': average_rating,
2548 'formats': formats,
2549 'is_live': is_live,
2550 'start_time': start_time,
2551 'end_time': end_time,
2552 'series': series,
2553 'season_number': season_number,
2554 'episode_number': episode_number,
2555 'track': track,
2556 'artist': artist,
2557 'album': album,
2558 'release_date': release_date,
2559 'release_year': release_year,
2560 }
2561
2562
class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        (?:
                            (?:
                                youtube(?:kids)?\.com|
                                invidio\.us
                            )
                            /
                            (?:
                               (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
                               \? (?:.*?[&;])*? (?:p|a|list)=
                            |  p/
                            )|
                            youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        (%(playlist_id)s)
                     )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    # %s is filled in with the video-id pattern; index and title groups are optional
    _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
    _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
        'info_dict': {
            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
            'uploader': 'Sergey M.',
            'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
            'title': 'youtube-dl public playlist',
        },
        'playlist_count': 1,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
        'info_dict': {
            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
            'uploader': 'Sergey M.',
            'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
            'title': 'youtube-dl empty playlist',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
            'uploader': 'Christiaan008',
            'uploader_id': 'ChRiStIaAn008',
        },
        'playlist_count': 96,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
            'id': 'PLBB231211A4F62143',
            'uploader': 'Wickydoo',
            'uploader_id': 'Wickydoo',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
            'uploader': 'Cauchemar',
            'uploader_id': 'Cauchemar89',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        },
        'playlist_count': 2,
        'skip': 'This playlist is private',
    }, {
        'note': 'embedded',
        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
            'uploader': 'milan',
            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
        }
    }, {
        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
        'playlist_mincount': 485,
        'info_dict': {
            'title': '2018 Chinese New Singles (11/6 updated)',
            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
            'uploader': 'LBK',
            'uploader_id': 'sdragonfang',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
        },
        'skip': 'This playlist does not exist',
    }, {
        'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
        'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
        'info_dict': {
            'title': 'Uploads from Interstellar Movie',
            'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
            'uploader': 'Interstellar Movie',
            'uploader_id': 'InterstellarMovie1',
        },
        'playlist_mincount': 21,
    }, {
        # Playlist URL that does not actually serve a playlist
        'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
        'info_dict': {
            'id': 'FqZTN594JQw',
            'ext': 'webm',
            'title': "Smiley's People 01 detective, Adventure Series, Action",
            'uploader': 'STREEM',
            'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng',
            'upload_date': '20150526',
            'license': 'Standard YouTube License',
            'description': 'md5:507cdcb5a49ac0da37a920ece610be80',
            'categories': ['People & Blogs'],
            'tags': list,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'This video is not available.',
        'add_ie': [YoutubeIE.ie_key()],
    }, {
        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
        'info_dict': {
            'id': 'yeWKywCrFtk',
            'ext': 'mp4',
            'title': 'Small Scale Baler and Braiding Rugs',
            'uploader': 'Backus-Page House Museum',
            'uploader_id': 'backuspagemuseum',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
            'upload_date': '20161008',
            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
            'categories': ['Nonprofits & Activism'],
            'tags': list,
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'noplaylist': True,
            'skip_download': True,
        },
    }, {
        # https://github.com/ytdl-org/youtube-dl/issues/21844
        'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
        'info_dict': {
            'title': 'Data Analysis with Dr Mike Pound',
            'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
            'uploader_id': 'Computerphile',
            'uploader': 'Computerphile',
        },
        'playlist_mincount': 11,
    }, {
        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
        'only_matching': True,
    }, {
        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
        'only_matching': True,
    }, {
        # music album playlist
        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
        'only_matching': True,
    }, {
        'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
        'only_matching': True,
    }]

    def _real_initialize(self):
        # Log in if credentials were supplied; private playlists require it.
        self._login()

    def extract_videos_from_page(self, page):
        """Scrape (video_id, title) pairs from a playlist webpage.

        Tries the modern data-video-id markup first, then falls back to
        progressively more relaxed href-based regexes via the base-class
        extract_videos_from_page_impl helper, which appends into the two
        shared lists.
        """
        ids_in_page = []
        titles_in_page = []

        for item in re.findall(
                r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page):
            attrs = extract_attributes(item)
            video_id = attrs['data-video-id']
            video_title = unescapeHTML(attrs.get('data-title'))
            if video_title:
                video_title = video_title.strip()
            ids_in_page.append(video_id)
            titles_in_page.append(video_title)

        # Fallback with old _VIDEO_RE
        self.extract_videos_from_page_impl(
            self._VIDEO_RE, page, ids_in_page, titles_in_page)

        # Relaxed fallbacks
        self.extract_videos_from_page_impl(
            r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page,
            ids_in_page, titles_in_page)
        self.extract_videos_from_page_impl(
            r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page,
            ids_in_page, titles_in_page)

        return zip(ids_in_page, titles_in_page)

    def _extract_mix(self, playlist_id):
        """Extract an auto-generated mix playlist.

        The mixes are generated from a single video; the id of the playlist
        is just 'RD' + video_id. New pages are fetched (starting from the
        last seen video) until a page yields no unseen ids.
        """
        ids = []
        last_id = playlist_id[-11:]
        for n in itertools.count(1):
            url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
            webpage = self._download_webpage(
                url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n))
            new_ids = orderedSet(re.findall(
                r'''(?xs)data-video-username=".*?".*?
                           href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id),
                webpage))
            # Fetch new pages until all the videos are repeated, it seems that
            # there are always 51 unique videos.
            new_ids = [_id for _id in new_ids if _id not in ids]
            if not new_ids:
                break
            ids.extend(new_ids)
            last_id = ids[-1]

        url_results = self._ids_to_results(ids)

        # Several markup variants have carried the mix title over time; try
        # them from most to least specific.
        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
        title_span = (
            search_title('playlist-title')
            or search_title('title long-title')
            or search_title('title'))
        title = clean_html(title_span)

        return self.playlist_result(url_results, playlist_id, title)

    def _extract_playlist(self, playlist_id):
        """Download the playlist page and return (has_videos, playlist).

        has_videos is False when the playlist URL does not actually serve any
        entries, which lets _real_extract fall back to single-video
        extraction.
        """
        url = self._TEMPLATE_URL % playlist_id
        page = self._download_webpage(url, playlist_id)

        # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604)
        for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
            match = match.strip()
            # Check if the playlist exists or is private
            mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match)
            if mobj:
                reason = mobj.group('reason')
                message = 'This playlist %s' % reason
                if 'private' in reason:
                    message += ', use --username or --netrc to access it'
                message += '.'
                raise ExtractorError(message, expected=True)
            elif re.match(r'[^<]*Invalid parameters[^<]*', match):
                raise ExtractorError(
                    'Invalid parameters. Maybe URL is incorrect.',
                    expected=True)
            elif re.match(r'[^<]*Choose your language[^<]*', match):
                # Language chooser banner, not an actual error — ignore it.
                continue
            else:
                self.report_warning('Youtube gives an alert message: ' + match)

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
            page, 'title', default=None)

        _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
        uploader = self._html_search_regex(
            r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
            page, 'uploader', default=None)
        mobj = re.search(
            r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
            page)
        if mobj:
            uploader_id = mobj.group('uploader_id')
            uploader_url = compat_urlparse.urljoin(url, mobj.group('path'))
        else:
            uploader_id = uploader_url = None

        has_videos = True

        if not playlist_title:
            try:
                # Some playlist URLs don't actually serve a playlist (e.g.
                # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
                next(self._entries(page, playlist_id))
            except StopIteration:
                has_videos = False

        playlist = self.playlist_result(
            self._entries(page, playlist_id), playlist_id, playlist_title)
        playlist.update({
            'uploader': uploader,
            'uploader_id': uploader_id,
            'uploader_url': uploader_url,
        })

        return has_videos, playlist

    def _check_download_just_video(self, url, playlist_id):
        """If *url* also carries a specific video id, honour --no-playlist.

        Returns (video_id, result): result is a single-video url_result when
        --no-playlist is set, otherwise None; video_id is None when the URL
        carries no video id at all.
        """
        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        video_id = query_dict.get('v', [None])[0] or self._search_regex(
            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
            'video id', default=None)
        if video_id:
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
        return None, None

    def _real_extract(self, url):
        """Dispatch between single-video, mix, and regular playlist extraction."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        video_id, video = self._check_download_just_video(url, playlist_id)
        if video:
            return video

        if playlist_id.startswith(('RD', 'UL', 'PU')):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        has_videos, playlist = self._extract_playlist(playlist_id)
        if has_videos or not video_id:
            return playlist

        # Some playlist URLs don't actually serve a playlist (see
        # https://github.com/ytdl-org/youtube-dl/issues/10537).
        # Fallback to plain video extraction if there is a video id
        # along with playlist id.
        return self.url_result(video_id, 'Youtube', video_id=video_id)
2924
2925
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
    IE_NAME = 'youtube:channel'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
        'info_dict': {
            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
            'title': 'Uploads from lex will',
            'uploader': 'lex will',
            'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
        }
    }, {
        'note': 'Age restricted channel',
        # from https://www.youtube.com/user/DeusExOfficial
        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
        'playlist_mincount': 64,
        'info_dict': {
            'id': 'UUs0ifCMCm1icqRbqhUINa0w',
            'title': 'Uploads from Deus Ex',
            'uploader': 'Deus Ex',
            'uploader_id': 'DeusExOfficial',
        },
    }, {
        'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
        'only_matching': True,
    }, {
        'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Defer to the more specific playlists/live extractors first.
        return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url)
                else super(YoutubeChannelIE, cls).suitable(url))

    def _build_template_url(self, url, channel_id):
        """Build the channel /videos URL; overridden by YoutubeUserIE."""
        return self._TEMPLATE_URL % channel_id

    def _real_extract(self, url):
        """Extract a channel's uploads.

        Prefers redirecting to the channel's 'UU...' uploads playlist (the
        by-page listing is capped); otherwise falls back to scraping the
        channel pages directly.
        """
        channel_id = self._match_id(url)

        url = self._build_template_url(url, channel_id)

        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
        # Workaround by extracting as a playlist if managed to obtain channel playlist URL
        # otherwise fallback on channel by page extraction
        channel_page = self._download_webpage(
            url + '?view=57', channel_id,
            'Downloading channel page', fatal=False)
        if channel_page is False:
            channel_playlist_id = False
        else:
            channel_playlist_id = self._html_search_meta(
                'channelId', channel_page, 'channel id', default=None)
            if not channel_playlist_id:
                # Fall back to app-link meta tags that carry the channel id.
                channel_url = self._html_search_meta(
                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
                    channel_page, 'channel url', default=None)
                if channel_url:
                    channel_playlist_id = self._search_regex(
                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
                        channel_url, 'channel id', default=None)
        if channel_playlist_id and channel_playlist_id.startswith('UC'):
            # Channel id 'UCxxx' maps to uploads playlist id 'UUxxx'.
            playlist_id = 'UU' + channel_playlist_id[2:]
            return self.url_result(
                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')

        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            entries = [
                self.url_result(
                    video_id, 'Youtube', video_id=video_id,
                    video_title=video_title)
                for video_id, video_title in self.extract_videos_from_page(channel_page)]
            return self.playlist_result(entries, channel_id)

        try:
            next(self._entries(channel_page, channel_id))
        except StopIteration:
            # No entries at all: surface YouTube's alert message if present.
            alert_message = self._html_search_regex(
                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
                channel_page, 'alert', default=None, group='alert')
            if alert_message:
                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)

        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
3025
3026
class YoutubeUserIE(YoutubeChannelIE):
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_%-]+)'
    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
            'title': 'Uploads from The Linux Foundation',
            'uploader': 'The Linux Foundation',
            'uploader_id': 'TheLinuxFoundation',
        }
    }, {
        # Only available via https://www.youtube.com/c/12minuteathlete/videos
        # but not https://www.youtube.com/user/12minuteathlete/videos
        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
        'playlist_mincount': 249,
        'info_dict': {
            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
            'title': 'Uploads from 12 Minute Athlete',
            'uploader': '12 Minute Athlete',
            'uploader_id': 'the12minuteathlete',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/gametrailers',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/gametrailers',
        'only_matching': True,
    }, {
        # This channel is not available, geo restricted to JP
        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Claim *url* only when no other YouTube extractor wants it.

        _VALID_URL above is extremely permissive, so every sibling
        Youtube*IE class in this module gets first refusal.
        """
        for name, klass in globals().items():
            if klass is cls:
                continue
            if not (name.startswith('Youtube') and name.endswith('IE')):
                continue
            if klass.suitable(url):
                return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _build_template_url(self, url, channel_id):
        """Fill the /<user-or-c>/<id>/videos template from the matched URL."""
        match = re.match(self._VALID_URL, url)
        path_kind = match.group('user') or 'user'
        return self._TEMPLATE_URL % (path_kind, match.group('id'))
3084
3085
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
    IE_DESC = 'YouTube.com live streams'
    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'
    IE_NAME = 'youtube:live'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheYoungTurks/live',
        'info_dict': {
            'id': 'a48o2S1cPoo',
            'ext': 'mp4',
            'title': 'The Young Turks - Live Main Show',
            'uploader': 'The Young Turks',
            'uploader_id': 'TheYoungTurks',
            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
            'upload_date': '20150715',
            'license': 'Standard YouTube License',
            'description': 'md5:438179573adcdff3c97ebb1ee632b891',
            'categories': ['News & Politics'],
            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
            'like_count': int,
            'dislike_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/TheYoungTurks/live',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Resolve a /live URL to the current live video when one is found,
        otherwise hand the bare channel/user URL back for re-dispatch."""
        match = re.match(self._VALID_URL, url)
        display_id = match.group('id')
        base_url = match.group('base_url')
        webpage = self._download_webpage(url, display_id, fatal=False)
        if not webpage:
            return self.url_result(base_url)
        page_type = self._og_search_property(
            'type', webpage, 'page type', default='')
        video_id = self._html_search_meta(
            'videoId', webpage, 'video id', default=None)
        # Only trust the meta video id when the page really is a video page
        # and the id has the canonical 11-character shape.
        if (page_type.startswith('video')
                and video_id and re.match(r'^[0-9A-Za-z_-]{11}$', video_id)):
            return self.url_result(video_id, YoutubeIE.ie_key())
        return self.url_result(base_url)
3136
3137
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
    """Extract all playlists from a user's or channel's /playlists tab."""
    IE_DESC = 'YouTube.com user/channel playlists'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists'
    IE_NAME = 'youtube:playlists'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
        'playlist_mincount': 4,
        'info_dict': {
            'id': 'ThirstForScience',
            'title': 'ThirstForScience',
        },
    }, {
        # with "Load more" button
        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
        'playlist_mincount': 70,
        'info_dict': {
            'id': 'igorkle1',
            'title': 'Игорь Клейнер',
        },
    }, {
        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
        'playlist_mincount': 17,
        'info_dict': {
            'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
            'title': 'Chem Player',
        },
        'skip': 'Blocked',
    }, {
        'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
        'only_matching': True,
    }]
3170
3171
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
    # Matches watch links on search-result pages; the title group is optional
    # because not every anchor carries a title attribute.
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
3174
3175
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
    IE_DESC = 'YouTube.com searches'
    # there doesn't appear to be a real limit, for example if you search for
    # 'python' you get more than 8.000.000 results
    _MAX_RESULTS = float('inf')
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    # Extra 'params' payload for the search request; subclasses override
    # (see YoutubeSearchDateIE).
    _SEARCH_PARAMS = None
    _TESTS = []

    def _entries(self, query, n):
        """Yield up to *n* search results for *query* via the InnerTube
        search API, following continuation tokens across pages."""
        data = {
            'context': {
                'client': {
                    # Identify as the desktop web client.
                    'clientName': 'WEB',
                    'clientVersion': '2.20201021.03.00',
                }
            },
            'query': query,
        }
        if self._SEARCH_PARAMS:
            data['params'] = self._SEARCH_PARAMS
        total = 0
        for page_num in itertools.count(1):
            # NOTE(review): the key below appears to be the public web-client
            # API key; confirm it is still valid if requests start failing.
            search = self._download_json(
                'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
                video_id='query "%s"' % query,
                note='Downloading page %s' % page_num,
                errnote='Unable to download API page', fatal=False,
                data=json.dumps(data).encode('utf8'),
                headers={'content-type': 'application/json'})
            if not search:
                break
            # First page nests results differently from continuation pages.
            slr_contents = try_get(
                search,
                (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'],
                 lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']),
                list)
            if not slr_contents:
                break
            isr_contents = try_get(
                slr_contents,
                lambda x: x[0]['itemSectionRenderer']['contents'],
                list)
            if not isr_contents:
                break
            for content in isr_contents:
                if not isinstance(content, dict):
                    continue
                # Skip non-video items (channels, playlists, ads, shelves).
                video = content.get('videoRenderer')
                if not isinstance(video, dict):
                    continue
                video_id = video.get('videoId')
                if not video_id:
                    continue
                title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
                description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
                duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
                view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
                view_count = int_or_none(self._search_regex(
                    r'^(\d+)', re.sub(r'\s', '', view_count_text),
                    'view count', default=None))
                uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
                total += 1
                yield {
                    '_type': 'url_transparent',
                    'ie_key': YoutubeIE.ie_key(),
                    'id': video_id,
                    'url': video_id,
                    'title': title,
                    'description': description,
                    'duration': duration,
                    'view_count': view_count,
                    'uploader': uploader,
                }
                if total == n:
                    return
            # Continuation token for the next page lives in the second
            # top-level item, when present.
            token = try_get(
                slr_contents,
                lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
                compat_str)
            if not token:
                break
            data['continuation'] = token

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        return self.playlist_result(self._entries(query, n), query)
3264
3265
class YoutubeSearchDateIE(YoutubeSearchIE):
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = 'YouTube.com searches, newest videos first'
    # URL-encoded 'CAI=' — presumably the protobuf search filter that sorts
    # results by upload date; TODO confirm against the web client.
    _SEARCH_PARAMS = 'CAI%3D'
3271
3272
class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
    IE_DESC = 'YouTube.com search URLs'
    IE_NAME = 'youtube:search_url'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }, {
        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Scrape a /results search page and return its videos as a playlist
        titled after the (decoded) search query."""
        raw_query = re.match(self._VALID_URL, url).group('query')
        query = compat_urllib_parse_unquote_plus(raw_query)
        webpage = self._download_webpage(url, query)
        entries = self._process_page(webpage)
        return self.playlist_result(entries, playlist_title=query)
3293
3294
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
    IE_NAME = 'youtube:show'
    _TESTS = [{
        'url': 'https://www.youtube.com/show/airdisasters',
        'playlist_mincount': 5,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Delegate to the playlists extractor on the show's /playlists page."""
        show_id = self._match_id(url)
        playlists_url = 'https://www.youtube.com/show/%s/playlists' % show_id
        return super(YoutubeShowIE, self)._real_extract(playlists_url)
3312
3313
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for feed extractors
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    # Feeds are personal, so an account is mandatory.
    _LOGIN_REQUIRED = True

    @property
    def IE_NAME(self):
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _entries(self, page):
        """Yield video results from the feed page, following the AJAX
        'load more' endpoint until it stops producing unseen ids."""
        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        more_widget_html = content_html = page
        for page_num in itertools.count(1):
            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)

            # 'recommended' feed has infinite 'load more' and each new portion spins
            # the same videos in (sometimes) slightly different order, so we'll check
            # for unicity and break when portion has no new videos
            new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches)))
            if not new_ids:
                break

            ids.extend(new_ids)

            for entry in self._ids_to_results(new_ids):
                yield entry

            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not mobj:
                break

            more = self._download_json(
                'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape,
                headers=self._YOUTUBE_CLIENT_HEADERS)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

    def _real_extract(self, url):
        """Download the feed page and wrap its entries in a playlist result."""
        page = self._download_webpage(
            'https://www.youtube.com/feed/%s' % self._FEED_NAME,
            self._PLAYLIST_TITLE)
        return self.playlist_result(
            self._entries(page), playlist_title=self._PLAYLIST_TITLE)
3366
3367
class YoutubeWatchLaterIE(YoutubePlaylistIE):
    IE_NAME = 'youtube:watchlater'
    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'

    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=WL',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the special 'WL' (Watch Later) playlist, honouring
        --no-playlist when the URL also names a single video."""
        single_video = self._check_download_just_video(url, 'WL')[1]
        if single_video:
            return single_video
        return self._extract_playlist('WL')[1]
3387
3388
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extract the authenticated user's favourite videos via the backing playlist."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of the playlist that backs it;
        # pull that id out and hand off to the playlist extractor.
        webpage = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', webpage, 'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
3399
3400
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Extract the authenticated user's "Recommended" feed."""
    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    # Path component appended to https://www.youtube.com/feed/ by the base
    # class's _real_extract.
    _FEED_NAME = 'recommended'
    # Title of the resulting playlist, also shown in download messages.
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
3406
3407
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Extract the authenticated user's subscriptions feed."""
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    # Path component appended to https://www.youtube.com/feed/ by the base
    # class's _real_extract.
    _FEED_NAME = 'subscriptions'
    # Title of the resulting playlist, also shown in download messages.
    _PLAYLIST_TITLE = 'Youtube Subscriptions'
3413
3414
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Extract the authenticated user's watch history feed."""
    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
    # Path component appended to https://www.youtube.com/feed/ by the base
    # class's _real_extract.
    _FEED_NAME = 'history'
    # Title of the resulting playlist, also shown in download messages.
    _PLAYLIST_TITLE = 'Youtube History'
3420
3421
class YoutubeTruncatedURLIE(InfoExtractor):
    """Match watch URLs that lost their v= parameter (typically because the
    user forgot to quote the URL and the shell ate everything after '&'),
    and raise a helpful, expected error instead of a generic failure.
    """
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?
        (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
        (?:watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+|
            x-yt-cl=[0-9]+|
            hl=[^&]*|
            t=[0-9]+
        )?
        |
            attribution_link\?a=[^&]+
        )
        $
    '''

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?feature=foo',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?hl=en-GB',
        'only_matching': True,
    }, {
        'url': 'https://www.youtube.com/watch?t=2372',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Any URL this pattern matches cannot contain a video id, so the only
        # useful action is to explain the likely shell-quoting mistake.
        # Note: the previous message contained a doubled space before
        # "or simply" caused by adjacent string fragments both carrying a
        # space; fixed here.
        raise ExtractorError(
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like youtube-dl '
            '"https://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            'or simply youtube-dl BaW_jenozKc .',
            expected=True)
3469
3470
class YoutubeTruncatedIDIE(InfoExtractor):
    """Match watch URLs whose video id is shorter than the required 11
    characters and raise an expected, explanatory error.
    """
    IE_NAME = 'youtube:truncated_id'
    IE_DESC = False  # Do not list
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'

    _TESTS = [{
        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # A matched id is 1-10 chars, never a full 11-char video id, so
        # extraction always fails with a user-facing explanation.
        video_id = self._match_id(url)
        message = 'Incomplete YouTube ID %s. URL %s looks truncated.' % (
            video_id, url)
        raise ExtractorError(message, expected=True)