#!/usr/bin/python3

# https://horriblevideos.com/page2.html

from multiprocessing import Pool
from datetime import datetime
import calendar
from bs4 import BeautifulSoup
import requests
import shutil
import os
import sys
import json
import time
import sqlite3

class PageBrowseInvalid(Exception):
    # page_number defaults to None so this can also be raised for a missing
    # video page, where no browse page number applies
    def __init__(self, page_number=None):
        self.page_number = page_number

class TagNameContainsSemicolon(Exception):
    pass

# Abstracts away the database
class VidyaDatabase:
    class VideoMetaDoesntExist(Exception):
        pass

    # Fields used by this program
    # id
    # meta_batch_id
    # video_download_batch_id

    # Fields in the table "hvideo" which hold metadata scraped from the site
    # date_added
    # thumbnail_filename
    # thumbnail_url
    # video_filename
    # video_title
    # video_views
    # rating
    # video_id
    # video_duration
    # tags
    # description

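    # The script assumes this table already exists; a minimal sketch of a
    # compatible schema, inferred from the fields listed above (the column
    # types are guesses, the real database may differ):
    #
    #   CREATE TABLE IF NOT EXISTS hvideo (
    #       id INTEGER PRIMARY KEY AUTOINCREMENT,
    #       video_id TEXT UNIQUE,
    #       meta_batch_id INTEGER,
    #       video_download_batch_id INTEGER,
    #       video_page_url TEXT,
    #       date_added TEXT,
    #       thumbnail_filename TEXT,
    #       thumbnail_url TEXT,
    #       video_filename TEXT,
    #       video_title TEXT,
    #       video_views INTEGER,
    #       rating TEXT,
    #       video_duration TEXT,
    #       tags TEXT,
    #       description TEXT
    #   );
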
    def __init__(self, save_path):
        self.db_name = save_path + "horrible.db"
        self.conn = sqlite3.connect(self.db_name)
        self.c = self.conn.cursor()

    def __del__(self):
        self.conn.commit()
        self.conn.close()

    def get_video_meta(self, video_id):
        params = (video_id,)
        entries = self.c.execute('SELECT * FROM hvideo WHERE video_id=?', params)
        return entries.fetchone()

    # Use this function to create or update meta data
    def change_video_meta(self, video_id, this_batch_id, video_params):
        sanitized_params = {}
        # Strip any fields in video_params which aren't allowed to be updated
        valid_fields = ["video_page_url", "date_added", "thumbnail_filename", "thumbnail_url", "video_filename", "video_title", "video_views", "rating", "video_duration", "tags", "description"]

        for key in video_params:
            if key in valid_fields:
                sanitized_params[key] = video_params[key]

        # Update the batch ID
        # TODO: Make sure this batch doesn't match any other ones
        sanitized_params['meta_batch_id'] = this_batch_id

        # Determine if we need to create a new record
        if self.get_video_meta(video_id) is None:
            self.c.execute('INSERT INTO hvideo(video_id) VALUES (?)', (video_id,))
            self.conn.commit()

        self._update_video_meta(video_id, this_batch_id, sanitized_params)

    # Internal use, for when the record already exists
    def _update_video_meta(self, video_id, this_batch_id, video_params):
        # Build the UPDATE statement column by column; the column names come
        # from the already-sanitized whitelist, and every value is bound as a
        # query parameter
        param_list = []
        sql = "UPDATE hvideo SET "
        current_param_index = 0
        for param in video_params:
            current_param_index = current_param_index + 1
            sql = sql + param + " = ?"
            if current_param_index < len(video_params):
                sql = sql + ", "
            else:
                sql = sql + " "

            param_list.append(video_params[param])

        sql = sql + "WHERE video_id = ?"
        param_list.append(video_id)

        self.c.execute(sql, param_list)
        self.conn.commit()
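
    # For example, a call such as
    #   db.change_video_meta("1234", batch_id, {"video_title": "clip", "rating": "80%"})
    # (hypothetical values) ends up executing
    #   UPDATE hvideo SET video_title = ?, rating = ?, meta_batch_id = ? WHERE video_id = ?
    # with param_list = ["clip", "80%", batch_id, "1234"].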


class VidyaStripper:
    def __init__(self, strip_page_root_url):
        self.strip_page_root_url = strip_page_root_url

    def get_page_browse_links(self, page_number):
        # Array of all the info we strip from the Page View
        horrible_data_items = []

        # Get the data
        page_url = self.get_page_url(page_number)
        page = requests.get(page_url)
        soup = BeautifulSoup(page.text, "html.parser")
        if page.text.find("Sorry, no results were found.", 0) != -1:
            raise PageBrowseInvalid(page_number)

        # Parse out the elements with videos
        horrible_items = soup.find_all("div", attrs={"class": "col-item"})

        # Iterate the elements and pull the data
        for horrible_item in horrible_items:
            # Reset this stored data
            horrible_data_item = {}

            # Get the Thumbnail
            thumbnail_urls = horrible_item.find_all("img", attrs={"class": "responsive"})
            for thumbnail_url in thumbnail_urls:
                horrible_data_item['thumbnail_url'] = thumbnail_url.attrs['src']

            # Get the URL to the page showing the video
            video_page_url = horrible_item.find("a").attrs['href']
            horrible_data_item['video_page_url'] = video_page_url

            # Get the ID of the video based on the URL
            dash_pos = video_page_url.rfind("-")
            video_id = video_page_url[dash_pos+1:-5]
            horrible_data_item['video_id'] = video_id
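            # For example (hypothetical URL), "https://horriblevideos.com/some-clip-1234.html"
            # puts dash_pos at the last "-", so the slice keeps "1234": everything
            # between that dash and the trailing ".html" (the last 5 characters)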

            # Get the title of the video
            video_title = horrible_item.find("span", attrs={"class": "title"}).text
            horrible_data_item['video_title'] = video_title

            # Add the item onto the list
            horrible_data_items.append(horrible_data_item)

        return horrible_data_items

    def get_video_info(self, video_url):
        video_page_info = {}
        page = requests.get(video_url)
        soup = BeautifulSoup(page.text, "html.parser")
        if page.text.find("Page you are looking for does not exist", 0) != -1:
            raise PageBrowseInvalid

        # Get the rating
        hvrating = soup.find("li", attrs={"class": "li-vote-percent"})
        if hvrating is not None:
            video_page_info['rating'] = hvrating.text.strip()

        # Get the Duration of video
        video_duration = soup.find("i", attrs={"class": "fa-clock-o"})
        if video_duration is not None:
            video_page_info['video_duration'] = video_duration.nextSibling.strip()

        # Get the number of views
        hvvideo_views = soup.find("i", attrs={"class": "fa-eye"})
        if hvvideo_views is not None:
            video_page_info['video_views'] = int(hvvideo_views.nextSibling.strip())

        # Get the date it was added
        hvdate_added = soup.find("i", attrs={"class": "fa-calendar"})
        if hvdate_added is not None:
            video_page_info['date_added'] = hvdate_added.nextSibling.strip()

        # Get the description
        hvdescription = soup.find("div", attrs={"class": "description"})
        if hvdescription is not None:
            video_page_info['description'] = hvdescription.text.strip()

        # Get the tags (stored as a single semicolon-separated string, which is
        # why a tag containing ";" is treated as fatal)
        tag_list = []
        tags_area = soup.find("i", attrs={"class": "fa fa-tags"})
        if tags_area is not None:
            tags = tags_area.parent.find_all("a")
            for tag in tags:
                if tag.text.find(";") != -1:
                    print("When stripping a video there was a tag with a semicolon")
                    print("Video URL: " + video_url)
                    raise TagNameContainsSemicolon

                tag_list.append(tag.text)
            string_tag_list = ";".join(tag_list)
            video_page_info['tags'] = string_tag_list

        return video_page_info

    def get_page_url(self, page_number):
        return self.strip_page_root_url + "page" + str(page_number) + ".html"
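        # e.g. get_page_url(2) -> "https://horriblevideos.com/page2.html",
        # matching the sample URL noted at the top of the file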


# Variables
save_path = "/media/james/Bad Stuff/Video Strip/"
strip_page_root_url = "https://horriblevideos.com/"

# Single-threaded version of the scrape loop, left commented out
# (note: end_page and dby are not defined in this script)
# strippy = VidyaStripper(strip_page_root_url)
# for meta_page_number in range(start_page, end_page):
#     for link in strippy.get_page_browse_links(meta_page_number):
#         video_page_dat = strippy.get_video_info(link['video_page_url'])
#         merged_dict = {**link, **video_page_dat}
#
#         print("Updating Meta For: " + (link['video_id']) + " → " + json.dumps(merged_dict))
#         dby.change_video_meta(link['video_id'], batch_id, merged_dict)


################################################
# Pool object that grabs the video information #
################################################
def get_vidya(url):
    # Not implemented yet; the multiprocessing pool below only maps
    # get_page_meta, so this worker is never called
    pass
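
# A rough, hypothetical sketch of what get_vidya() might eventually do, assuming
# `url` points directly at a video file (the original leaves the worker as a
# stub; this helper is illustrative only and is not called anywhere):
def _example_download_video(url):
    # Derive a local filename from the last path segment of the URL
    local_name = save_path + url.rsplit("/", 1)[-1]
    # Stream the response to disk instead of loading it all into memory
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_name, "wb") as f:
            shutil.copyfileobj(r.raw, f)
    return local_name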

####################################################
# Pool Object that gets the list of videos to grab #
####################################################
def get_page_meta(x):
    meta_page_number, batch_id = x

    # Create our own stripper object
    strippy = VidyaStripper(strip_page_root_url)

    print("Getting page: ", meta_page_number)

    try:
        page_links = strippy.get_page_browse_links(meta_page_number)
    except PageBrowseInvalid as err:
        print("Page Number " + str(err.page_number) + " is invalid")
        # Without this return, page_links would be unbound below
        return "Page " + str(meta_page_number) + " was invalid"

    # print(page_links)
    for link in page_links:
        # print(" - " + link['video_page_url'])
        video_page_dat = strippy.get_video_info(link['video_page_url'])
        merged_dict = {**link, **video_page_dat}

        dby = VidyaDatabase(save_path)
        dby.change_video_meta(link['video_id'], batch_id, merged_dict)

    return "Updated page " + str(meta_page_number)


###################################
# Main entry point of the program #
###################################
if __name__ == '__main__':
    d = datetime.utcnow()
    batch_id = calendar.timegm(d.utctimetuple())
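    # calendar.timegm() of a UTC time tuple is the current Unix timestamp, so
    # every run is tagged with its start time in epoch seconds
    # (e.g. a run at 2021-02-08 00:00 UTC would use batch_id 1612742400)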

    start_page = 200
    do_pages = 400

    # Generate a list of stuff to work on
    datas = []

    for page_number in range(start_page, start_page + do_pages):
        datas.append((page_number, batch_id))

    # Create a pool of 40 worker processes, each fetching the metadata for one page
    with Pool(40) as p:
        p.map(get_page_meta, datas)