#!/usr/bin/python3
# https://horriblevideos.com/page2.html

import shlex, subprocess
from multiprocessing import Pool
from datetime import datetime
import calendar
from bs4 import BeautifulSoup
import requests
import shutil
import os
import sys
import json
import time
import sqlite3

# Abstracts away the database
class VidyaDatabase:
    class VideoMetaDoesntExist(Exception):
        pass

    # Fields used internally by this program
    #   id
    #   meta_batch_id
    #   video_download_batch_id

    # Fields in the "hvideo" table which hold metadata scraped from the site
    #   date_added
    #   thumbnail_filename
    #   thumbnail_url
    #   video_filename
    #   video_title
    #   video_views
    #   rating
    #   video_id
    #   video_duration
    #   tags
    #   description

    def __init__(self, save_path):
        self.db_name = save_path + "horrible.db"
        self.conn = sqlite3.connect(self.db_name)
        self.conn.row_factory = sqlite3.Row
        self.c = self.conn.cursor()
        self.closed = False
        self.setup_if_necessary()

    def setup_if_necessary(self):
        create_db_statement = """
            CREATE TABLE "hvideo" (
                id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
                video_page_url TEXT,
                date_added TEXT,
                thumbnail_filename TEXT,
                thumbnail_url TEXT,
                video_filename TEXT,
                meta_batch_id INTEGER,
                video_download_batch_id INTEGER,
                video_title TEXT,
                video_views INTEGER,
                rating INTEGER,
                video_id INTEGER,
                video_duration TEXT,
                tags TEXT,
                description TEXT
            );
        """

        hvideo_table = self.c.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='hvideo';")
        rows = hvideo_table.fetchmany()
        if len(rows) == 0:
            self.c.execute(create_db_statement)
            self.conn.commit()

    def __del__(self):
        self._close_db()

    def _close_db(self):
        if not self.closed:
            self.conn.commit()
            self.conn.close()
            self.closed = True

    # Returns the row for a given video ID, or None if there is no entry.
    # Mostly used to check whether a video already has an entry in the DB.
    def get_video_meta(self, video_id):
        params = (video_id,)
        entries = self.c.execute('SELECT * FROM hvideo WHERE video_id=?', params)
        return entries.fetchone()

    # Finds up to quantity_to_fetch videos whose video_filename column is still NULL
    # (i.e. not yet downloaded) and returns their column values as a list of dicts
    def find_undownloaded_video(self, quantity_to_fetch=200, min_video_id=0, max_video_id=9999999):
        null_videos = self.c.execute('SELECT * FROM hvideo WHERE video_filename IS NULL AND video_id >= ? AND video_id <= ?', (min_video_id, max_video_id))
        prepared_results = []

        for _ in range(quantity_to_fetch):
            null_video_row = null_videos.fetchone()
            if null_video_row is None:
                break

            # Convert the sqlite3.Row into a plain dict
            prepared_result = {key: null_video_row[key] for key in null_video_row.keys()}
            prepared_results.append(prepared_result)

        return prepared_results

    # Use this function to update metadata; creates the row if it doesn't already exist
    def change_video_meta(self, video_id, this_batch_id, video_params):
        sanitized_params = {}

        # Strip any fields in video_params which aren't allowed to be updated
        valid_fields = ["video_page_url", "date_added", "thumbnail_filename", "thumbnail_url", "video_filename", "video_title", "video_views", "rating", "video_duration", "tags", "description"]

        for key in video_params:
            if key in valid_fields:
                sanitized_params[key] = video_params[key]

        # Update the batch ID
        # TODO: Make sure this batch doesn't match any other ones
        sanitized_params['meta_batch_id'] = this_batch_id

        # Determine if we need to create a new record
        if self.get_video_meta(video_id) is None:
            self.c.execute('INSERT INTO hvideo(video_id) VALUES (?)', (video_id,))
            self.conn.commit()

        self._update_video_meta(video_id, this_batch_id, sanitized_params)

    # Internal use, for when the record already exists. Builds a statement of the form:
    #   UPDATE hvideo SET col1 = ?, col2 = ? ... WHERE video_id = ?
    # Column names are safe to interpolate into the SQL string here because
    # change_video_meta() only passes keys from its valid_fields whitelist.
    def _update_video_meta(self, video_id, this_batch_id, video_params):
        # Build our SQL statement
        param_list = []
        sql = "UPDATE hvideo SET "
        current_param_index = 0

        for param in video_params:
            current_param_index = current_param_index + 1
            sql = sql + param + " = ?"
            if current_param_index < len(video_params):
                sql = sql + ", "
            else:
                sql = sql + " "

            param_list.append(video_params[param])

        sql = sql + "WHERE video_id = ?"
        param_list.append(video_id)

        self.c.execute(sql, param_list)
        self.conn.commit()


# High level class which specifically strips horriblevideos.com
class VidyaStripper:
    # Thrown when we're trying to get the info for a page that isn't there
    class PageBrowseInvalid(Exception):
        def __init__(self, page_number):
            self.page_number = page_number

    # Thrown when a tag contains a semi-colon
    class TagNameContainsSemicolon(Exception):
        pass

    def __init__(self, strip_page_root_url):
        self.strip_page_root_url = strip_page_root_url

    def get_page_browse_links(self, page_number):
        # Array of all the info we strip from the page view
        horrible_data_items = []

        # Get the data
        page_url = self.get_page_url(page_number)
        page = requests.get(page_url)
        soup = BeautifulSoup(page.text, "html.parser")
        if page.text.find("Sorry, no results were found.", 0) != -1:
            raise VidyaStripper.PageBrowseInvalid(page_number)

        # Parse out the elements with videos
        horrible_items = soup.find_all("div", attrs={"class": "col-item"})

        # Iterate the elements and pull the data
        for horrible_item in horrible_items:
            # Reset the stored data for this item
            horrible_data_item = {}

            # Get the thumbnail
            thumbnail_urls = horrible_item.find_all("img", attrs={"class": "responsive"})
            for thumbnail_url in thumbnail_urls:
                horrible_data_item['thumbnail_url'] = thumbnail_url.attrs['src']

            # Get the URL to the page showing the video
            video_page_url = horrible_item.find("a").attrs['href']
            horrible_data_item['video_page_url'] = video_page_url

            # Get the ID of the video based on the URL (the digits between the
            # last dash and the trailing ".html")
            dash_pos = video_page_url.rfind("-")
            video_id = video_page_url[dash_pos+1:-5]
            horrible_data_item['video_id'] = video_id

            # Get the title of the video
            video_title = horrible_item.find("span", attrs={"class": "title"}).text
            horrible_data_item['video_title'] = video_title

            # Add the item onto the list
            horrible_data_items.append(horrible_data_item)

        return horrible_data_items

    def get_video_info(self, video_url):
        video_page_info = {}
        page = requests.get(video_url)
        soup = BeautifulSoup(page.text, "html.parser")
        if page.text.find("Page you are looking for does not exist", 0) != -1:
            raise VidyaStripper.PageBrowseInvalid(video_url)

        # Get the rating
        hvrating = soup.find("li", attrs={"class": "li-vote-percent"})
        if hvrating is not None:
            video_page_info['rating'] = hvrating.text.strip()

        # Get the duration of the video
        video_duration = soup.find("i", attrs={"class": "fa-clock-o"})
        if video_duration is not None:
            video_page_info['video_duration'] = video_duration.next_sibling.strip()

        # Get the number of views
        hvvideo_views = soup.find("i", attrs={"class": "fa-eye"})
        if hvvideo_views is not None:
            video_page_info['video_views'] = int(hvvideo_views.next_sibling.strip())

        # Get the date it was added
        hvdate_added = soup.find("i", attrs={"class": "fa-calendar"})
        if hvdate_added is not None:
            video_page_info['date_added'] = hvdate_added.next_sibling.strip()

        # Get the description
        hvdescription = soup.find("div", attrs={"class": "description"})
        if hvdescription is not None:
            video_page_info['description'] = hvdescription.text.strip()

        # Get the tags (stored as a single semi-colon separated string)
        tag_list = []
        tags_area = soup.find("i", attrs={"class": "fa fa-tags"})
        if tags_area is not None:
            tags = tags_area.parent.find_all("a")
            for tag in tags:
                if tag.text.find(";") != -1:
                    print("When stripping a video there was a tag with a semi-colon")
                    print("Video URL: " + video_url)
                    raise VidyaStripper.TagNameContainsSemicolon

                tag_list.append(tag.text)

            video_page_info['tags'] = ";".join(tag_list)

        return video_page_info

    def get_page_url(self, page_number):
        return self.strip_page_root_url + "page" + str(page_number) + ".html"


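# A minimal usage sketch for VidyaStripper. Like the database example above it is
# illustrative only and never called by the script; the page number is arbitrary.
def _example_stripper_usage():
    stripper = VidyaStripper("https://horriblevideos.com/")
    links = stripper.get_page_browse_links(2)           # raises PageBrowseInvalid past the last page
    info = stripper.get_video_info(links[0]['video_page_url'])
    return {**links[0], **info}                          # same merge that get_page_meta() performs

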
# Variables
save_path = "/media/james/Bad Stuff/Video Strip/"
strip_page_root_url = "https://horriblevideos.com/"


######################################
# First main function of this script #
######################################

####################################################
# Pool worker that gets the list of videos to grab #
####################################################
def get_page_meta(x):
    meta_page_number, batch_id = x

    # Create our own stripper object
    strippy = VidyaStripper(strip_page_root_url)

    print("Getting page: ", meta_page_number)

    try:
        page_links = strippy.get_page_browse_links(meta_page_number)
    except VidyaStripper.PageBrowseInvalid as err:
        # Nothing to do for a page that doesn't exist
        print("Page Number " + str(err.page_number) + " is invalid")
        return "Skipped invalid page " + str(meta_page_number)

    for link in page_links:
        dby = VidyaDatabase(save_path)

        # Check if we already have the metadata for this video; if so, skip it
        if dby.get_video_meta(link['video_id']) is not None:
            print("Skipping video " + link['video_id'])
            continue

        # This video doesn't have any metadata yet, so download it
        video_page_dat = strippy.get_video_info(link['video_page_url'])

        # Combine the information we got from the browse page and the specific video's page
        merged_dict = {**link, **video_page_dat}

        # Update the database
        dby.change_video_meta(link['video_id'], batch_id, merged_dict)

    return "Updated page " + str(meta_page_number)

#################################
# Main driver for this function #
#################################
def gather_video_metadata():
    d = datetime.utcnow()
    batch_id = calendar.timegm(d.utctimetuple())

    start_page = 1
    do_pages = 20

    # Generate a list of work items, one per page
    datas = []

    for page_number in range(start_page, start_page + do_pages):
        datas.append((page_number, batch_id))

    # Create a pool of 40 worker processes to fetch the data for each page number
    with Pool(40) as p:
        p.map(get_page_meta, datas)

#######################################
# Second main function of this script #
#######################################
def download_vidya_and_update(params):
    result = {}

    vidya, batch_id = params

    output_filename = str(vidya['video_id']) + ".mp4"
    output_path = save_path + "Videos/"
    youtube_dl_output = subprocess.run(["youtube-dl", vidya['video_page_url'], "-o", output_path + output_filename],
                                       stdout=subprocess.PIPE, stderr=subprocess.PIPE)

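    # For reference, the subprocess call above is roughly equivalent to running
    # (the URL and ID are made-up examples, not real values from the site):
    #   youtube-dl "https://horriblevideos.com/some-video-1234.html" \
    #       -o "/media/james/Bad Stuff/Video Strip/Videos/1234.mp4"
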
    result['video_id'] = vidya['video_id']

    # A return code of 0 means youtube-dl succeeded; record the filename in the DB
    if youtube_dl_output.returncode == 0:
        dby = VidyaDatabase(save_path)
        video_params = {}
        video_params['video_filename'] = output_filename
        dby.change_video_meta(vidya['video_id'], batch_id, video_params)

        result['type'] = "success"
        result['filename'] = output_filename
    else:
        result['type'] = "error"
        result['stdout'] = youtube_dl_output.stdout
        result['stderr'] = youtube_dl_output.stderr

    return result


def download_videos(this_batch_count=400, minimum_video_id=2500, maximum_video_id=9999):
    d = datetime.utcnow()
    batch_id = calendar.timegm(d.utctimetuple())

    # Grab the batch of undownloaded videos, then release the DB before forking workers
    dby = VidyaDatabase(save_path)
    undownloaded_vidyas = dby.find_undownloaded_video(this_batch_count, minimum_video_id, maximum_video_id)
    dby._close_db()

    datas = []

    for undownloaded_vidya in undownloaded_vidyas:
        datas.append((undownloaded_vidya, batch_id))

    with Pool(50) as p:
        results = p.map(download_vidya_and_update, datas)

    return results


def create_database():
    # The schema is created on demand by VidyaDatabase.setup_if_necessary(),
    # so simply opening the database is enough to create the file and table
    VidyaDatabase(save_path)

###################################
# Main entry point of the program #
###################################
if __name__ == '__main__':
    # This is a crude little program; uncomment the function that you want to perform
    # ---------------------------------------------------------------------------------

    # Function 1: scrape metadata from the browse pages
    # gather_video_metadata()

    # Function 2: download the videos that have metadata but no file yet
    download_videos(400, 0, 2500)