#!/usr/bin/python3
# https://horriblevideos.com/page2.html

import subprocess
from multiprocessing import Pool
from datetime import datetime
import calendar
from bs4 import BeautifulSoup
import requests
import sqlite3

# Abstracts away the database
class VidyaDatabase:
    class VideoMetaDoesntExist(Exception):
        pass

    # Fields used by this program:
    #   id
    #   meta_batch_id
    #   video_download_batch_id

    # Fields in the "hvideo" table which hold metadata scraped from the site:
    #   date_added
    #   thumbnail_filename
    #   thumbnail_url
    #   video_filename
    #   video_title
    #   video_views
    #   rating
    #   video_id
    #   video_duration
    #   tags
    #   description

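    # A plausible schema sketch for the "hvideo" table. Nothing in this
    # script creates the table, so it must already exist; the column types
    # below are assumptions based on the field list above, not confirmed.
    HVIDEO_SCHEMA_SKETCH = """
        CREATE TABLE IF NOT EXISTS hvideo (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            meta_batch_id INTEGER,
            video_download_batch_id INTEGER,
            video_page_url TEXT,
            date_added TEXT,
            thumbnail_filename TEXT,
            thumbnail_url TEXT,
            video_filename TEXT,
            video_title TEXT,
            video_views INTEGER,
            rating TEXT,
            video_id INTEGER UNIQUE,
            video_duration TEXT,
            tags TEXT,
            description TEXT
        )
    """
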
    def __init__(self, save_path):
        self.db_name = save_path + "horrible.db"
        self.conn = sqlite3.connect(self.db_name)
        self.conn.row_factory = sqlite3.Row
        self.c = self.conn.cursor()
        self.closed = False

    def __del__(self):
        self._close_db()

    def _close_db(self):
        if not self.closed:
            self.conn.commit()
            self.conn.close()
            self.closed = True

    # Returns the row for the given video ID, or None. Callers don't look at
    # the column values; this is mainly used to check whether an entry exists
    # in the DB
    def get_video_meta(self, video_id):
        params = (video_id,)
        entries = self.c.execute('SELECT * FROM hvideo WHERE video_id=?', params)
        return entries.fetchone()

    # Returns up to `quantity_to_fetch` video entries whose video_filename
    # column is NULL (i.e. not yet downloaded), as a list of dicts
    def find_undownloaded_video(self, quantity_to_fetch, min_video_id=0, max_video_id=9999999):
        null_videos = self.c.execute('SELECT * FROM hvideo WHERE video_filename IS NULL AND video_id >= ? AND video_id <= ?', (min_video_id, max_video_id))
        prepared_results = []

        for _ in range(quantity_to_fetch):
            null_video_row = null_videos.fetchone()
            # Stop early if there are fewer matching rows than requested
            if null_video_row is None:
                break
            # sqlite3.Row supports the mapping protocol, so it converts
            # straight to a plain dict
            prepared_results.append(dict(null_video_row))

        return prepared_results

    # Use this function to update metadata; creates the row if it doesn't already exist
    def change_video_meta(self, video_id, this_batch_id, video_params):
        # Strip any fields in video_params which aren't allowed to be updated
        valid_fields = ["video_page_url", "date_added", "thumbnail_filename", "thumbnail_url", "video_filename", "video_title", "video_views", "rating", "video_duration", "tags", "description"]
        sanitized_params = {key: value for key, value in video_params.items() if key in valid_fields}

        # Update the batch ID
        # TODO: Make sure this batch doesn't match any other ones
        sanitized_params['meta_batch_id'] = this_batch_id

        # Determine if we need to create a new record
        if self.get_video_meta(video_id) is None:
            self.c.execute('INSERT INTO hvideo(video_id) VALUES (?)', (video_id,))
            self.conn.commit()

        self._update_video_meta(video_id, this_batch_id, sanitized_params)

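    # Example call (hypothetical values):
    #   db = VidyaDatabase("/tmp/")
    #   db.change_video_meta(1234, 1612766000, {'video_title': 'Example', 'bogus': 'x'})
    # The 'bogus' key is silently dropped by the whitelist above.
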
    # Internal use, for when the record already exists
    def _update_video_meta(self, video_id, this_batch_id, video_params):
        # Column names can't be bound as ? placeholders, so the SET clause is
        # built by string concatenation; this is safe only because the keys
        # were whitelisted in change_video_meta()
        assignments = ", ".join(param + " = ?" for param in video_params)
        sql = "UPDATE hvideo SET " + assignments + " WHERE video_id = ?"

        param_list = list(video_params.values())
        param_list.append(video_id)

        self.c.execute(sql, param_list)
        self.conn.commit()
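
    # Worked example (hypothetical values): video_params of
    #   {'video_title': 'Example', 'meta_batch_id': 1612766000}
    # yields the statement
    #   UPDATE hvideo SET video_title = ?, meta_batch_id = ? WHERE video_id = ?
    # executed with the parameter list ['Example', 1612766000, video_id].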


# High level class which specifically scrapes horriblevideos.com
class VidyaStripper:
    # Thrown when we're trying to get the info for a page that isn't there
    class PageBrowseInvalid(Exception):
        def __init__(self, page_number):
            self.page_number = page_number

    # Thrown when a tag contains a semicolon, which would break the
    # semicolon-separated tag list stored in the DB
    class TagNameContainsSemicolon(Exception):
        pass

    def __init__(self, strip_page_root_url):
        self.strip_page_root_url = strip_page_root_url

    def get_page_browse_links(self, page_number):
        # List of all the info we strip from the page view
        horrible_data_items = []

        # Get the data
        page_url = self.get_page_url(page_number)
        page = requests.get(page_url)
        soup = BeautifulSoup(page.text, "html.parser")
        if "Sorry, no results were found." in page.text:
            raise VidyaStripper.PageBrowseInvalid(page_number)

        # Parse out the elements with videos
        horrible_items = soup.find_all("div", attrs={"class": "col-item"})

        # Iterate the elements and pull the data
        for horrible_item in horrible_items:
            # Reset the stored data
            horrible_data_item = {}

            # Get the thumbnail
            thumbnail_urls = horrible_item.find_all("img", attrs={"class": "responsive"})
            for thumbnail_url in thumbnail_urls:
                horrible_data_item['thumbnail_url'] = thumbnail_url.attrs['src']

            # Get the URL of the page showing the video
            video_page_url = horrible_item.find("a").attrs['href']
            horrible_data_item['video_page_url'] = video_page_url

            # Get the ID of the video from the URL, which ends in "-<id>.html"
            # (e.g. ".../some-title-1234.html" gives "1234")
            dash_pos = video_page_url.rfind("-")
            video_id = video_page_url[dash_pos + 1:-5]
            horrible_data_item['video_id'] = video_id

            # Get the title of the video
            video_title = horrible_item.find("span", attrs={"class": "title"}).text
            horrible_data_item['video_title'] = video_title

            # Add the item onto the list
            horrible_data_items.append(horrible_data_item)

        return horrible_data_items

    def get_video_info(self, video_url):
        video_page_info = {}
        page = requests.get(video_url)
        soup = BeautifulSoup(page.text, "html.parser")
        if "Page you are looking for does not exist" in page.text:
            # PageBrowseInvalid wants a page identifier; pass the URL here
            raise VidyaStripper.PageBrowseInvalid(video_url)

        # Get the rating
        hvrating = soup.find("li", attrs={"class": "li-vote-percent"})
        if hvrating is not None:
            video_page_info['rating'] = hvrating.text.strip()

        # Get the duration of the video
        video_duration = soup.find("i", attrs={"class": "fa-clock-o"})
        if video_duration is not None:
            video_page_info['video_duration'] = video_duration.next_sibling.strip()

        # Get the number of views
        hvvideo_views = soup.find("i", attrs={"class": "fa-eye"})
        if hvvideo_views is not None:
            video_page_info['video_views'] = int(hvvideo_views.next_sibling.strip())

        # Get the date it was added
        hvdate_added = soup.find("i", attrs={"class": "fa-calendar"})
        if hvdate_added is not None:
            video_page_info['date_added'] = hvdate_added.next_sibling.strip()

        # Get the description
        hvdescription = soup.find("div", attrs={"class": "description"})
        if hvdescription is not None:
            video_page_info['description'] = hvdescription.text.strip()

        # Get the tags, stored as a single semicolon-separated string
        tag_list = []
        tags_area = soup.find("i", attrs={"class": "fa fa-tags"})
        if tags_area is not None:
            tags = tags_area.parent.find_all("a")
            for tag in tags:
                if tag.text.find(";") != -1:
                    print("When stripping a video there was a tag with a semicolon")
                    print("Video URL: " + video_url)
                    raise VidyaStripper.TagNameContainsSemicolon

                tag_list.append(tag.text)
            video_page_info['tags'] = ";".join(tag_list)

        return video_page_info

    def get_page_url(self, page_number):
        # e.g. page_number 2 gives "https://horriblevideos.com/page2.html"
        return self.strip_page_root_url + "page" + str(page_number) + ".html"

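# A minimal usage sketch of the scraper class (hypothetical page number;
# assumes the site is reachable and its markup hasn't changed):
#
#   stripper = VidyaStripper("https://horriblevideos.com/")
#   links = stripper.get_page_browse_links(2)
#   info = stripper.get_video_info(links[0]['video_page_url'])
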
# Variables
save_path = "/media/james/Bad Stuff/Video Strip/"
strip_page_root_url = "https://horriblevideos.com/"


####################################
# One main function of this script #
####################################

####################################################
# Pool Object that gets the list of videos to grab #
####################################################
def get_page_meta(x):
    meta_page_number, batch_id = x

    # Create our own stripper object
    strippy = VidyaStripper(strip_page_root_url)

    print("Getting page: ", meta_page_number)

    try:
        page_links = strippy.get_page_browse_links(meta_page_number)
    except VidyaStripper.PageBrowseInvalid as err:
        # Bail out here, otherwise page_links would be unbound below
        print("Page Number " + str(err.page_number) + " is invalid")
        return "Skipped page " + str(meta_page_number)

    # One DB connection covers all the links on this page
    dby = VidyaDatabase(save_path)

    for link in page_links:
        # Check if we already have the metadata for this video; if so, skip it
        if dby.get_video_meta(link['video_id']) is not None:
            print("Skipping video " + link['video_id'])
            continue

        # This video doesn't have any metadata yet, so fetch it
        video_page_dat = strippy.get_video_info(link['video_page_url'])

        # Combine the information from the browse page and the video's own page
        merged_dict = {**link, **video_page_dat}

        # Update the database
        dby.change_video_meta(link['video_id'], batch_id, merged_dict)

    return "Updated page " + str(meta_page_number)

##############################
# Main part of this function #
##############################
def gather_video_metadata():
    # Use the current Unix timestamp as this run's batch ID
    d = datetime.utcnow()
    batch_id = calendar.timegm(d.utctimetuple())

    start_page = 1
    do_pages = 20

    # Generate a list of (page_number, batch_id) tuples to work on
    datas = []

    for page_number in range(start_page, start_page + do_pages):
        datas.append((page_number, batch_id))

    # Create a pool of 40 worker processes to fetch the pages in parallel
    with Pool(40) as p:
        p.map(get_page_meta, datas)

#######################################
# Second main function of this script #
#######################################
def download_vidya_and_update(params):
    result = {}

    vidya, batch_id = params

    output_filename = str(vidya['video_id']) + ".mp4"
    output_path = save_path + "Videos/"
    # Capture stderr as well as stdout, since both are reported on failure below
    youtube_dl_output = subprocess.run(["youtube-dl", vidya['video_page_url'], "-o", output_path + output_filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    result['video_id'] = vidya['video_id']

    # A return code of 0 means youtube-dl succeeded
    if youtube_dl_output.returncode == 0:
        dby = VidyaDatabase(save_path)
        video_params = {}
        video_params['video_filename'] = output_filename
        dby.change_video_meta(vidya['video_id'], batch_id, video_params)

        result['type'] = "success"
        result['filename'] = output_filename
    else:
        result['type'] = "error"
        result['stdout'] = youtube_dl_output.stdout
        result['stderr'] = youtube_dl_output.stderr

    return result

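# For reference, the subprocess call above is roughly the same as running
# by hand (URL and ID purely illustrative):
#
#   youtube-dl "https://horriblevideos.com/some-title-1234.html" \
#       -o "/media/james/Bad Stuff/Video Strip/Videos/1234.mp4"
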

def download_videos():
    d = datetime.utcnow()
    batch_id = calendar.timegm(d.utctimetuple())

    dby = VidyaDatabase(save_path)
    undownloaded_vidyas = dby.find_undownloaded_video(40, 0, 2500)
    dby._close_db()

    datas = []

    for undownloaded_vidya in undownloaded_vidyas:
        datas.append((undownloaded_vidya, batch_id))

    with Pool(20) as p:
        results = p.map(download_vidya_and_update, datas)

    # Hand the per-video results back so a caller can inspect failures
    return results

###################################
# Main entry point of the program #
###################################
if __name__ == '__main__':
    # This is a crude little program; uncomment the function you want to perform
    # ---------------------------------------------------------------------------

    # Function 1
    gather_video_metadata()

    # Function 2
    # download_videos()