#!/usr/bin/python3
# https://horriblevideos.com/page2.html

import shlex, subprocess
from multiprocessing import Pool
from datetime import datetime
import calendar
from bs4 import BeautifulSoup
import requests
import shutil
import os
import sys
import json
import time
import sqlite3

# Abstracts away the database
class VidyaDatabase:
    class VideoMetaDoesntExist(Exception):
        pass

    # Fields used internally by this program
    #   id
    #   meta_batch_id
    #   video_download_batch_id

    # Fields in the "hvideo" table which hold metadata scraped from the site
    #   date_added
    #   thumbnail_filename
    #   thumbnail_url
    #   video_filename
    #   video_title
    #   video_views
    #   rating
    #   video_id
    #   video_duration
    #   tags
    #   description

    def __init__(self, save_path):
        self.db_name = save_path + "horrible.db"
        self.conn = sqlite3.connect(self.db_name)
        self.conn.row_factory = sqlite3.Row
        self.c = self.conn.cursor()
        self.closed = False
        self.setup_if_necessary()

    def setup_if_necessary(self):
        create_db_statement = """
            CREATE TABLE "hvideo" (
                id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
                video_page_url TEXT,
                date_added TEXT,
                thumbnail_filename TEXT,
                thumbnail_url TEXT,
                video_filename TEXT,
                meta_batch_id INTEGER,
                video_download_batch_id INTEGER,
                video_title TEXT,
                video_views INTEGER,
                rating INTEGER,
                video_id INTEGER,
                video_duration TEXT,
                tags TEXT,
                description TEXT
            );
        """

        hvideo_table = self.c.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='hvideo';")
        rows = hvideo_table.fetchmany()
        if len(rows) == 0:
            self.c.execute(create_db_statement)
            self.conn.commit()

    def __del__(self):
        self._close_db()

    def _close_db(self):
        if not self.closed:
            self.conn.commit()
            self.conn.close()
            self.closed = True

    # Returns the row for a given video ID, or None if there is no entry.
    # Mostly used to check whether a video already has an entry in the DB.
    def get_video_meta(self, video_id):
        params = (video_id,)
        entries = self.c.execute('SELECT * FROM hvideo WHERE video_id=?', params)
        return entries.fetchone()

    # Finds up to quantity_to_fetch videos whose video_filename column is still NULL
    # (i.e. not yet downloaded) and returns their column values as a list of dicts
    def find_undownloaded_video(self, quantity_to_fetch=200, min_video_id=0, max_video_id=9999999):
        null_videos = self.c.execute('SELECT * FROM hvideo WHERE video_filename IS NULL AND video_id >= ? AND video_id <= ?', (min_video_id, max_video_id))
        prepared_results = []

        for _ in range(quantity_to_fetch):
            null_video_row = null_videos.fetchone()
            if null_video_row is None:
                break

            # Convert the sqlite3.Row into a plain dict
            prepared_result = {key: null_video_row[key] for key in null_video_row.keys()}
            prepared_results.append(prepared_result)

        return prepared_results

    # Use this function to update metadata; creates the row if it doesn't already exist
    def change_video_meta(self, video_id, this_batch_id, video_params):
        sanitized_params = {}

        # Strip any fields in video_params which aren't allowed to be updated
        valid_fields = ["video_page_url", "date_added", "thumbnail_filename", "thumbnail_url", "video_filename", "video_title", "video_views", "rating", "video_duration", "tags", "description"]

        for key in video_params:
            if key in valid_fields:
                sanitized_params[key] = video_params[key]

        # Update the batch ID
        # TODO: Make sure this batch doesn't match any other ones
        sanitized_params['meta_batch_id'] = this_batch_id

        # Determine if we need to create a new record
        if self.get_video_meta(video_id) is None:
            self.c.execute('INSERT INTO hvideo(video_id) VALUES (?)', (video_id,))
            self.conn.commit()

        self._update_video_meta(video_id, this_batch_id, sanitized_params)

    # Internal use, for when the record already exists. Builds a statement of the form:
    #   UPDATE hvideo SET col1 = ?, col2 = ? ... WHERE video_id = ?
    # Column names are safe to interpolate into the SQL string here because
    # change_video_meta() only passes keys from its valid_fields whitelist.
    def _update_video_meta(self, video_id, this_batch_id, video_params):
        # Build our SQL statement
        param_list = []
        sql = "UPDATE hvideo SET "
        current_param_index = 0

        for param in video_params:
            current_param_index = current_param_index + 1
            sql = sql + param + " = ?"
            if current_param_index < len(video_params):
                sql = sql + ", "
            else:
                sql = sql + " "

            param_list.append(video_params[param])

        sql = sql + "WHERE video_id = ?"
        param_list.append(video_id)

        self.c.execute(sql, param_list)
        self.conn.commit()


# High level class which specifically strips horriblevideos.com
class VidyaStripper:
    # Thrown when we're trying to get the info for a page that isn't there
    class PageBrowseInvalid(Exception):
        def __init__(self, page_number):
            self.page_number = page_number

    # Thrown when a tag contains a semi-colon
    class TagNameContainsSemicolon(Exception):
        pass

    def __init__(self, strip_page_root_url):
        self.strip_page_root_url = strip_page_root_url

    def get_page_browse_links(self, page_number):
        # Array of all the info we strip from the page view
        horrible_data_items = []

        # Get the data
        page_url = self.get_page_url(page_number)
        page = requests.get(page_url)
        soup = BeautifulSoup(page.text, "html.parser")
        if page.text.find("Sorry, no results were found.", 0) != -1:
            raise VidyaStripper.PageBrowseInvalid(page_number)

        # Parse out the elements with videos
        horrible_items = soup.find_all("div", attrs={"class": "col-item"})

        # Iterate the elements and pull the data
        for horrible_item in horrible_items:
            # Reset the stored data for this item
            horrible_data_item = {}

            # Get the thumbnail
            thumbnail_urls = horrible_item.find_all("img", attrs={"class": "responsive"})
            for thumbnail_url in thumbnail_urls:
                horrible_data_item['thumbnail_url'] = thumbnail_url.attrs['src']

            # Get the URL to the page showing the video
            video_page_url = horrible_item.find("a").attrs['href']
            horrible_data_item['video_page_url'] = video_page_url

            # Get the ID of the video based on the URL (the digits between the
            # last dash and the trailing ".html")
            dash_pos = video_page_url.rfind("-")
            video_id = video_page_url[dash_pos+1:-5]
            horrible_data_item['video_id'] = video_id

            # Get the title of the video
            video_title = horrible_item.find("span", attrs={"class": "title"}).text
            horrible_data_item['video_title'] = video_title

            # Add the item onto the list
            horrible_data_items.append(horrible_data_item)

        return horrible_data_items

    def get_video_info(self, video_url):
        video_page_info = {}
        page = requests.get(video_url)
        soup = BeautifulSoup(page.text, "html.parser")
        if page.text.find("Page you are looking for does not exist", 0) != -1:
            raise VidyaStripper.PageBrowseInvalid(video_url)

        # Get the rating
        hvrating = soup.find("li", attrs={"class": "li-vote-percent"})
        if hvrating is not None:
            video_page_info['rating'] = hvrating.text.strip()

        # Get the duration of the video
        video_duration = soup.find("i", attrs={"class": "fa-clock-o"})
        if video_duration is not None:
            video_page_info['video_duration'] = video_duration.next_sibling.strip()

        # Get the number of views
        hvvideo_views = soup.find("i", attrs={"class": "fa-eye"})
        if hvvideo_views is not None:
            video_page_info['video_views'] = int(hvvideo_views.next_sibling.strip())

        # Get the date it was added
        hvdate_added = soup.find("i", attrs={"class": "fa-calendar"})
        if hvdate_added is not None:
            video_page_info['date_added'] = hvdate_added.next_sibling.strip()

        # Get the description
        hvdescription = soup.find("div", attrs={"class": "description"})
        if hvdescription is not None:
            video_page_info['description'] = hvdescription.text.strip()

        # Get the tags (stored as a single semi-colon separated string)
        tag_list = []
        tags_area = soup.find("i", attrs={"class": "fa fa-tags"})
        if tags_area is not None:
            tags = tags_area.parent.find_all("a")
            for tag in tags:
                if tag.text.find(";") != -1:
                    print("When stripping a video there was a tag with a semi-colon")
                    print("Video URL: " + video_url)
                    raise VidyaStripper.TagNameContainsSemicolon

                tag_list.append(tag.text)

            video_page_info['tags'] = ";".join(tag_list)

        return video_page_info

    def get_page_url(self, page_number):
        return self.strip_page_root_url + "page" + str(page_number) + ".html"


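# A minimal usage sketch for VidyaStripper. Like the database example above it is
# illustrative only and never called by the script; the page number is arbitrary.
def _example_stripper_usage():
    stripper = VidyaStripper("https://horriblevideos.com/")
    links = stripper.get_page_browse_links(2)           # raises PageBrowseInvalid past the last page
    info = stripper.get_video_info(links[0]['video_page_url'])
    return {**links[0], **info}                          # same merge that get_page_meta() performs

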
# Variables
save_path = "/media/james/Bad Stuff/Video Strip/"
strip_page_root_url = "https://horriblevideos.com/"


######################################
# First main function of this script #
######################################

####################################################
# Pool worker that gets the list of videos to grab #
####################################################
def get_page_meta(x):
    meta_page_number, batch_id = x

    # Create our own stripper object
    strippy = VidyaStripper(strip_page_root_url)

    print("Getting page: ", meta_page_number)

    try:
        page_links = strippy.get_page_browse_links(meta_page_number)
    except VidyaStripper.PageBrowseInvalid as err:
        # Nothing to do for a page that doesn't exist
        print("Page Number " + str(err.page_number) + " is invalid")
        return "Skipped invalid page " + str(meta_page_number)

    for link in page_links:
        dby = VidyaDatabase(save_path)

        # Check if we already have the metadata for this video; if so, skip it
        if dby.get_video_meta(link['video_id']) is not None:
            print("Skipping video " + link['video_id'])
            continue

        # This video doesn't have any metadata yet, so download it
        video_page_dat = strippy.get_video_info(link['video_page_url'])

        # Combine the information we got from the browse page and the specific video's page
        merged_dict = {**link, **video_page_dat}

        # Update the database
        dby.change_video_meta(link['video_id'], batch_id, merged_dict)

    return "Updated page " + str(meta_page_number)

#################################
# Main driver for this function #
#################################
def gather_video_metadata():
    d = datetime.utcnow()
    batch_id = calendar.timegm(d.utctimetuple())

    start_page = 1
    do_pages = 20

    # Generate a list of work items, one per page
    datas = []

    for page_number in range(start_page, start_page + do_pages):
        datas.append((page_number, batch_id))

    # Create a pool of 40 worker processes to fetch the data for each page number
    with Pool(40) as p:
        p.map(get_page_meta, datas)

#######################################
# Second main function of this script #
#######################################
def download_vidya_and_update(params):
    result = {}

    vidya, batch_id = params

    output_filename = str(vidya['video_id']) + ".mp4"
    output_path = save_path + "Videos/"
    youtube_dl_output = subprocess.run(["youtube-dl", vidya['video_page_url'], "-o", output_path + output_filename],
                                       stdout=subprocess.PIPE, stderr=subprocess.PIPE)

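    # For reference, the subprocess call above is roughly equivalent to running
    # (the URL and ID are made-up examples, not real values from the site):
    #   youtube-dl "https://horriblevideos.com/some-video-1234.html" \
    #       -o "/media/james/Bad Stuff/Video Strip/Videos/1234.mp4"
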
    result['video_id'] = vidya['video_id']

    # A return code of 0 means youtube-dl succeeded; record the filename in the DB
    if youtube_dl_output.returncode == 0:
        dby = VidyaDatabase(save_path)
        video_params = {}
        video_params['video_filename'] = output_filename
        dby.change_video_meta(vidya['video_id'], batch_id, video_params)

        result['type'] = "success"
        result['filename'] = output_filename
    else:
        result['type'] = "error"
        result['stdout'] = youtube_dl_output.stdout
        result['stderr'] = youtube_dl_output.stderr

    return result


def download_videos(this_batch_count=400, minimum_video_id=2500, maximum_video_id=9999):
    d = datetime.utcnow()
    batch_id = calendar.timegm(d.utctimetuple())

    # Grab the batch of undownloaded videos, then release the DB before forking workers
    dby = VidyaDatabase(save_path)
    undownloaded_vidyas = dby.find_undownloaded_video(this_batch_count, minimum_video_id, maximum_video_id)
    dby._close_db()

    datas = []

    for undownloaded_vidya in undownloaded_vidyas:
        datas.append((undownloaded_vidya, batch_id))

    with Pool(50) as p:
        results = p.map(download_vidya_and_update, datas)

    return results


def create_database():
    # The schema is created on demand by VidyaDatabase.setup_if_necessary(),
    # so simply opening the database is enough to create the file and table
    VidyaDatabase(save_path)

###################################
# Main entry point of the program #
###################################
if __name__ == '__main__':
    # This is a crude little program; uncomment the function that you want to perform
    # ---------------------------------------------------------------------------------

    # Function 1: scrape metadata from the browse pages
    # gather_video_metadata()

    # Function 2: download the videos that have metadata but no file yet
    download_videos(400, 0, 2500)