#!/usr/bin/python3
# https://horriblevideos.com/page2.html

import subprocess
from multiprocessing import Pool
from datetime import datetime
import calendar
from bs4 import BeautifulSoup
import requests
import sqlite3

# Abstracts away the database
class VidyaDatabase:
    class VideoMetaDoesntExist(Exception):
        pass

    # Fields used by this program:
    #   id
    #   meta_batch_id
    #   video_download_batch_id

    # Fields in the "hvideo" table which hold metadata scraped from the site:
    #   date_added
    #   thumbnail_filename
    #   thumbnail_url
    #   video_filename
    #   video_title
    #   video_views
    #   rating
    #   video_id
    #   video_duration
    #   tags
    #   description

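    # A plausible schema sketch for the "hvideo" table. Nothing in this
    # script creates the table, so it must already exist; the column types
    # below are assumptions based on the field list above, not confirmed.
    HVIDEO_SCHEMA_SKETCH = """
        CREATE TABLE IF NOT EXISTS hvideo (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            meta_batch_id INTEGER,
            video_download_batch_id INTEGER,
            video_page_url TEXT,
            date_added TEXT,
            thumbnail_filename TEXT,
            thumbnail_url TEXT,
            video_filename TEXT,
            video_title TEXT,
            video_views INTEGER,
            rating TEXT,
            video_id INTEGER UNIQUE,
            video_duration TEXT,
            tags TEXT,
            description TEXT
        )
    """
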
    def __init__(self, save_path):
        self.db_name = save_path + "horrible.db"
        self.conn = sqlite3.connect(self.db_name)
        self.conn.row_factory = sqlite3.Row
        self.c = self.conn.cursor()
        self.closed = False

    def __del__(self):
        self._close_db()

    def _close_db(self):
        if not self.closed:
            self.conn.commit()
            self.conn.close()
            self.closed = True

    # Returns the row for the given video ID, or None. Callers don't look at
    # the column values; this is mainly used to check whether an entry exists
    # in the DB
    def get_video_meta(self, video_id):
        params = (video_id,)
        entries = self.c.execute('SELECT * FROM hvideo WHERE video_id=?', params)
        return entries.fetchone()

    # Returns up to `quantity_to_fetch` video entries whose video_filename
    # column is NULL (i.e. not yet downloaded), as a list of dicts
    def find_undownloaded_video(self, quantity_to_fetch, min_video_id=0, max_video_id=9999999):
        null_videos = self.c.execute('SELECT * FROM hvideo WHERE video_filename IS NULL AND video_id >= ? AND video_id <= ?', (min_video_id, max_video_id))
        prepared_results = []

        for _ in range(quantity_to_fetch):
            null_video_row = null_videos.fetchone()
            # Stop early if there are fewer matching rows than requested
            if null_video_row is None:
                break
            # sqlite3.Row supports the mapping protocol, so it converts
            # straight to a plain dict
            prepared_results.append(dict(null_video_row))

        return prepared_results

    # Use this function to update metadata; creates the row if it doesn't already exist
    def change_video_meta(self, video_id, this_batch_id, video_params):
        # Strip any fields in video_params which aren't allowed to be updated
        valid_fields = ["video_page_url", "date_added", "thumbnail_filename", "thumbnail_url", "video_filename", "video_title", "video_views", "rating", "video_duration", "tags", "description"]
        sanitized_params = {key: value for key, value in video_params.items() if key in valid_fields}

        # Update the batch ID
        # TODO: Make sure this batch doesn't match any other ones
        sanitized_params['meta_batch_id'] = this_batch_id

        # Determine if we need to create a new record
        if self.get_video_meta(video_id) is None:
            self.c.execute('INSERT INTO hvideo(video_id) VALUES (?)', (video_id,))
            self.conn.commit()

        self._update_video_meta(video_id, this_batch_id, sanitized_params)

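    # Example call (hypothetical values):
    #   db = VidyaDatabase("/tmp/")
    #   db.change_video_meta(1234, 1612766000, {'video_title': 'Example', 'bogus': 'x'})
    # The 'bogus' key is silently dropped by the whitelist above.
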
    # Internal use, for when the record already exists
    def _update_video_meta(self, video_id, this_batch_id, video_params):
        # Column names can't be bound as ? placeholders, so the SET clause is
        # built by string concatenation; this is safe only because the keys
        # were whitelisted in change_video_meta()
        assignments = ", ".join(param + " = ?" for param in video_params)
        sql = "UPDATE hvideo SET " + assignments + " WHERE video_id = ?"

        param_list = list(video_params.values())
        param_list.append(video_id)

        self.c.execute(sql, param_list)
        self.conn.commit()
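
    # Worked example (hypothetical values): video_params of
    #   {'video_title': 'Example', 'meta_batch_id': 1612766000}
    # yields the statement
    #   UPDATE hvideo SET video_title = ?, meta_batch_id = ? WHERE video_id = ?
    # executed with the parameter list ['Example', 1612766000, video_id].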


# High level class which specifically scrapes horriblevideos.com
class VidyaStripper:
    # Thrown when we're trying to get the info for a page that isn't there
    class PageBrowseInvalid(Exception):
        def __init__(self, page_number):
            self.page_number = page_number

    # Thrown when a tag contains a semicolon, which would break the
    # semicolon-separated tag list stored in the DB
    class TagNameContainsSemicolon(Exception):
        pass

    def __init__(self, strip_page_root_url):
        self.strip_page_root_url = strip_page_root_url

    def get_page_browse_links(self, page_number):
        # List of all the info we strip from the page view
        horrible_data_items = []

        # Get the data
        page_url = self.get_page_url(page_number)
        page = requests.get(page_url)
        soup = BeautifulSoup(page.text, "html.parser")
        if "Sorry, no results were found." in page.text:
            raise VidyaStripper.PageBrowseInvalid(page_number)

        # Parse out the elements with videos
        horrible_items = soup.find_all("div", attrs={"class": "col-item"})

        # Iterate the elements and pull the data
        for horrible_item in horrible_items:
            # Reset the stored data
            horrible_data_item = {}

            # Get the thumbnail
            thumbnail_urls = horrible_item.find_all("img", attrs={"class": "responsive"})
            for thumbnail_url in thumbnail_urls:
                horrible_data_item['thumbnail_url'] = thumbnail_url.attrs['src']

            # Get the URL of the page showing the video
            video_page_url = horrible_item.find("a").attrs['href']
            horrible_data_item['video_page_url'] = video_page_url

            # Get the ID of the video from the URL, which ends in "-<id>.html"
            # (e.g. ".../some-title-1234.html" gives "1234")
            dash_pos = video_page_url.rfind("-")
            video_id = video_page_url[dash_pos + 1:-5]
            horrible_data_item['video_id'] = video_id

            # Get the title of the video
            video_title = horrible_item.find("span", attrs={"class": "title"}).text
            horrible_data_item['video_title'] = video_title

            # Add the item onto the list
            horrible_data_items.append(horrible_data_item)

        return horrible_data_items

    def get_video_info(self, video_url):
        video_page_info = {}
        page = requests.get(video_url)
        soup = BeautifulSoup(page.text, "html.parser")
        if "Page you are looking for does not exist" in page.text:
            # PageBrowseInvalid wants a page identifier; pass the URL here
            raise VidyaStripper.PageBrowseInvalid(video_url)

        # Get the rating
        hvrating = soup.find("li", attrs={"class": "li-vote-percent"})
        if hvrating is not None:
            video_page_info['rating'] = hvrating.text.strip()

        # Get the duration of the video
        video_duration = soup.find("i", attrs={"class": "fa-clock-o"})
        if video_duration is not None:
            video_page_info['video_duration'] = video_duration.next_sibling.strip()

        # Get the number of views
        hvvideo_views = soup.find("i", attrs={"class": "fa-eye"})
        if hvvideo_views is not None:
            video_page_info['video_views'] = int(hvvideo_views.next_sibling.strip())

        # Get the date it was added
        hvdate_added = soup.find("i", attrs={"class": "fa-calendar"})
        if hvdate_added is not None:
            video_page_info['date_added'] = hvdate_added.next_sibling.strip()

        # Get the description
        hvdescription = soup.find("div", attrs={"class": "description"})
        if hvdescription is not None:
            video_page_info['description'] = hvdescription.text.strip()

        # Get the tags, stored as a single semicolon-separated string
        tag_list = []
        tags_area = soup.find("i", attrs={"class": "fa fa-tags"})
        if tags_area is not None:
            tags = tags_area.parent.find_all("a")
            for tag in tags:
                if tag.text.find(";") != -1:
                    print("When stripping a video there was a tag with a semicolon")
                    print("Video URL: " + video_url)
                    raise VidyaStripper.TagNameContainsSemicolon

                tag_list.append(tag.text)
            video_page_info['tags'] = ";".join(tag_list)

        return video_page_info

    def get_page_url(self, page_number):
        # e.g. page_number 2 gives "https://horriblevideos.com/page2.html"
        return self.strip_page_root_url + "page" + str(page_number) + ".html"

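# A minimal usage sketch of the scraper class (hypothetical page number;
# assumes the site is reachable and its markup hasn't changed):
#
#   stripper = VidyaStripper("https://horriblevideos.com/")
#   links = stripper.get_page_browse_links(2)
#   info = stripper.get_video_info(links[0]['video_page_url'])
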
# Variables
save_path = "/media/james/Bad Stuff/Video Strip/"
strip_page_root_url = "https://horriblevideos.com/"


####################################
# One main function of this script #
####################################

####################################################
# Pool Object that gets the list of videos to grab #
####################################################
def get_page_meta(x):
    meta_page_number, batch_id = x

    # Create our own stripper object
    strippy = VidyaStripper(strip_page_root_url)

    print("Getting page: ", meta_page_number)

    try:
        page_links = strippy.get_page_browse_links(meta_page_number)
    except VidyaStripper.PageBrowseInvalid as err:
        # Bail out here, otherwise page_links would be unbound below
        print("Page Number " + str(err.page_number) + " is invalid")
        return "Skipped page " + str(meta_page_number)

    # One DB connection covers all the links on this page
    dby = VidyaDatabase(save_path)

    for link in page_links:
        # Check if we already have the metadata for this video; if so, skip it
        if dby.get_video_meta(link['video_id']) is not None:
            print("Skipping video " + link['video_id'])
            continue

        # This video doesn't have any metadata yet, so fetch it
        video_page_dat = strippy.get_video_info(link['video_page_url'])

        # Combine the information from the browse page and the video's own page
        merged_dict = {**link, **video_page_dat}

        # Update the database
        dby.change_video_meta(link['video_id'], batch_id, merged_dict)

    return "Updated page " + str(meta_page_number)

##############################
# Main part of this function #
##############################
def gather_video_metadata():
    # Use the current Unix timestamp as this run's batch ID
    d = datetime.utcnow()
    batch_id = calendar.timegm(d.utctimetuple())

    start_page = 1
    do_pages = 20

    # Generate a list of (page_number, batch_id) tuples to work on
    datas = []

    for page_number in range(start_page, start_page + do_pages):
        datas.append((page_number, batch_id))

    # Create a pool of 40 worker processes to fetch the pages in parallel
    with Pool(40) as p:
        p.map(get_page_meta, datas)

#######################################
# Second main function of this script #
#######################################
def download_vidya_and_update(params):
    result = {}

    vidya, batch_id = params

    output_filename = str(vidya['video_id']) + ".mp4"
    output_path = save_path + "Videos/"
    # Capture stderr as well as stdout, since both are reported on failure below
    youtube_dl_output = subprocess.run(["youtube-dl", vidya['video_page_url'], "-o", output_path + output_filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    result['video_id'] = vidya['video_id']

    # A return code of 0 means youtube-dl succeeded
    if youtube_dl_output.returncode == 0:
        dby = VidyaDatabase(save_path)
        video_params = {}
        video_params['video_filename'] = output_filename
        dby.change_video_meta(vidya['video_id'], batch_id, video_params)

        result['type'] = "success"
        result['filename'] = output_filename
    else:
        result['type'] = "error"
        result['stdout'] = youtube_dl_output.stdout
        result['stderr'] = youtube_dl_output.stderr

    return result

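# For reference, the subprocess call above is roughly the same as running
# by hand (URL and ID purely illustrative):
#
#   youtube-dl "https://horriblevideos.com/some-title-1234.html" \
#       -o "/media/james/Bad Stuff/Video Strip/Videos/1234.mp4"
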

def download_videos():
    d = datetime.utcnow()
    batch_id = calendar.timegm(d.utctimetuple())

    dby = VidyaDatabase(save_path)
    undownloaded_vidyas = dby.find_undownloaded_video(40, 0, 2500)
    dby._close_db()

    datas = []

    for undownloaded_vidya in undownloaded_vidyas:
        datas.append((undownloaded_vidya, batch_id))

    with Pool(20) as p:
        results = p.map(download_vidya_and_update, datas)

    # Hand the per-video results back so a caller can inspect failures
    return results

###################################
# Main entry point of the program #
###################################
if __name__ == '__main__':
    # This is a crude little program; uncomment the function you want to perform
    # ---------------------------------------------------------------------------

    # Function 1
    gather_video_metadata()

    # Function 2
    # download_videos()