#!/usr/bin/python3

# https://horriblevideos.com/page2.html

from multiprocessing import Pool
from datetime import datetime
import calendar
from bs4 import BeautifulSoup
import requests
import shutil
import os
import sys
import json
import time
import sqlite3

class PageBrowseInvalid(Exception):
    # page_number defaults to None so this can also be raised for a missing
    # video page, where no browse page number applies
    def __init__(self, page_number=None):
        self.page_number = page_number

class TagNameContainsSemicolon(Exception):
    pass

# Abstracts away the database
class VidyaDatabase:
    class VideoMetaDoesntExist(Exception):
        pass

    # Fields used by this program
    # id
    # meta_batch_id
    # video_download_batch_id

    # Fields in the table "hvideo" which hold metadata scraped from the site
    # date_added
    # thumbnail_filename
    # thumbnail_url
    # video_filename
    # video_title
    # video_views
    # rating
    # video_id
    # video_duration
    # tags
    # description

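    # The script assumes this table already exists; a minimal sketch of a
    # compatible schema, inferred from the fields listed above (the column
    # types are guesses, the real database may differ):
    #
    #   CREATE TABLE IF NOT EXISTS hvideo (
    #       id INTEGER PRIMARY KEY AUTOINCREMENT,
    #       video_id TEXT UNIQUE,
    #       meta_batch_id INTEGER,
    #       video_download_batch_id INTEGER,
    #       video_page_url TEXT,
    #       date_added TEXT,
    #       thumbnail_filename TEXT,
    #       thumbnail_url TEXT,
    #       video_filename TEXT,
    #       video_title TEXT,
    #       video_views INTEGER,
    #       rating TEXT,
    #       video_duration TEXT,
    #       tags TEXT,
    #       description TEXT
    #   );
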
    def __init__(self, save_path):
        self.db_name = save_path + "horrible.db"
        self.conn = sqlite3.connect(self.db_name)
        self.c = self.conn.cursor()

    def __del__(self):
        self.conn.commit()
        self.conn.close()

    def get_video_meta(self, video_id):
        params = (video_id,)
        entries = self.c.execute('SELECT * FROM hvideo WHERE video_id=?', params)
        return entries.fetchone()

    # Use this function to create or update meta data
    def change_video_meta(self, video_id, this_batch_id, video_params):
        sanitized_params = {}
        # Strip any fields in video_params which aren't allowed to be updated
        valid_fields = ["video_page_url", "date_added", "thumbnail_filename", "thumbnail_url", "video_filename", "video_title", "video_views", "rating", "video_duration", "tags", "description"]

        for key in video_params:
            if key in valid_fields:
                sanitized_params[key] = video_params[key]

        # Update the batch ID
        # TODO: Make sure this batch doesn't match any other ones
        sanitized_params['meta_batch_id'] = this_batch_id

        # Determine if we need to create a new record
        if self.get_video_meta(video_id) is None:
            self.c.execute('INSERT INTO hvideo(video_id) VALUES (?)', (video_id,))
            self.conn.commit()

        self._update_video_meta(video_id, this_batch_id, sanitized_params)

    # Internal use, for when the record already exists
    def _update_video_meta(self, video_id, this_batch_id, video_params):
        # Build the UPDATE statement column by column; the column names come
        # from the already-sanitized whitelist, and every value is bound as a
        # query parameter
        param_list = []
        sql = "UPDATE hvideo SET "
        current_param_index = 0
        for param in video_params:
            current_param_index = current_param_index + 1
            sql = sql + param + " = ?"
            if current_param_index < len(video_params):
                sql = sql + ", "
            else:
                sql = sql + " "

            param_list.append(video_params[param])

        sql = sql + "WHERE video_id = ?"
        param_list.append(video_id)

        self.c.execute(sql, param_list)
        self.conn.commit()
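
    # For example, a call such as
    #   db.change_video_meta("1234", batch_id, {"video_title": "clip", "rating": "80%"})
    # (hypothetical values) ends up executing
    #   UPDATE hvideo SET video_title = ?, rating = ?, meta_batch_id = ? WHERE video_id = ?
    # with param_list = ["clip", "80%", batch_id, "1234"].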


class VidyaStripper:
    def __init__(self, strip_page_root_url):
        self.strip_page_root_url = strip_page_root_url

    def get_page_browse_links(self, page_number):
        # Array of all the info we strip from the Page View
        horrible_data_items = []

        # Get the data
        page_url = self.get_page_url(page_number)
        page = requests.get(page_url)
        soup = BeautifulSoup(page.text, "html.parser")
        if page.text.find("Sorry, no results were found.", 0) != -1:
            raise PageBrowseInvalid(page_number)

        # Parse out the elements with videos
        horrible_items = soup.find_all("div", attrs={"class": "col-item"})

        # Iterate the elements and pull the data
        for horrible_item in horrible_items:
            # Reset this stored data
            horrible_data_item = {}

            # Get the Thumbnail
            thumbnail_urls = horrible_item.find_all("img", attrs={"class": "responsive"})
            for thumbnail_url in thumbnail_urls:
                horrible_data_item['thumbnail_url'] = thumbnail_url.attrs['src']

            # Get the URL to the page showing the video
            video_page_url = horrible_item.find("a").attrs['href']
            horrible_data_item['video_page_url'] = video_page_url

            # Get the ID of the video based on the URL
            dash_pos = video_page_url.rfind("-")
            video_id = video_page_url[dash_pos+1:-5]
            horrible_data_item['video_id'] = video_id
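            # For example (hypothetical URL), "https://horriblevideos.com/some-clip-1234.html"
            # puts dash_pos at the last "-", so the slice keeps "1234": everything
            # between that dash and the trailing ".html" (the last 5 characters)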

            # Get the title of the video
            video_title = horrible_item.find("span", attrs={"class": "title"}).text
            horrible_data_item['video_title'] = video_title

            # Add the item onto the list
            horrible_data_items.append(horrible_data_item)

        return horrible_data_items

    def get_video_info(self, video_url):
        video_page_info = {}
        page = requests.get(video_url)
        soup = BeautifulSoup(page.text, "html.parser")
        if page.text.find("Page you are looking for does not exist", 0) != -1:
            raise PageBrowseInvalid

        # Get the rating
        hvrating = soup.find("li", attrs={"class": "li-vote-percent"})
        if hvrating is not None:
            video_page_info['rating'] = hvrating.text.strip()

        # Get the Duration of video
        video_duration = soup.find("i", attrs={"class": "fa-clock-o"})
        if video_duration is not None:
            video_page_info['video_duration'] = video_duration.nextSibling.strip()

        # Get the number of views
        hvvideo_views = soup.find("i", attrs={"class": "fa-eye"})
        if hvvideo_views is not None:
            video_page_info['video_views'] = int(hvvideo_views.nextSibling.strip())

        # Get the date it was added
        hvdate_added = soup.find("i", attrs={"class": "fa-calendar"})
        if hvdate_added is not None:
            video_page_info['date_added'] = hvdate_added.nextSibling.strip()

        # Get the description
        hvdescription = soup.find("div", attrs={"class": "description"})
        if hvdescription is not None:
            video_page_info['description'] = hvdescription.text.strip()

        # Get the tags (stored as a single semicolon-separated string, which is
        # why a tag containing ";" is treated as fatal)
        tag_list = []
        tags_area = soup.find("i", attrs={"class": "fa fa-tags"})
        if tags_area is not None:
            tags = tags_area.parent.find_all("a")
            for tag in tags:
                if tag.text.find(";") != -1:
                    print("When stripping a video there was a tag with a semicolon")
                    print("Video URL: " + video_url)
                    raise TagNameContainsSemicolon

                tag_list.append(tag.text)
            string_tag_list = ";".join(tag_list)
            video_page_info['tags'] = string_tag_list

        return video_page_info

    def get_page_url(self, page_number):
        return self.strip_page_root_url + "page" + str(page_number) + ".html"
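        # e.g. get_page_url(2) -> "https://horriblevideos.com/page2.html",
        # matching the sample URL noted at the top of the file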


# Variables
save_path = "/media/james/Bad Stuff/Video Strip/"
strip_page_root_url = "https://horriblevideos.com/"

# Single-threaded version of the scrape loop, left commented out
# (note: end_page and dby are not defined in this script)
# strippy = VidyaStripper(strip_page_root_url)
# for meta_page_number in range(start_page, end_page):
#     for link in strippy.get_page_browse_links(meta_page_number):
#         video_page_dat = strippy.get_video_info(link['video_page_url'])
#         merged_dict = {**link, **video_page_dat}
#
#         print("Updating Meta For: " + (link['video_id']) + " → " + json.dumps(merged_dict))
#         dby.change_video_meta(link['video_id'], batch_id, merged_dict)


################################################
# Pool object that grabs the video information #
################################################
def get_vidya(url):
    # Not implemented yet; the multiprocessing pool below only maps
    # get_page_meta, so this worker is never called
    pass
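
# A rough, hypothetical sketch of what get_vidya() might eventually do, assuming
# `url` points directly at a video file (the original leaves the worker as a
# stub; this helper is illustrative only and is not called anywhere):
def _example_download_video(url):
    # Derive a local filename from the last path segment of the URL
    local_name = save_path + url.rsplit("/", 1)[-1]
    # Stream the response to disk instead of loading it all into memory
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_name, "wb") as f:
            shutil.copyfileobj(r.raw, f)
    return local_name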

####################################################
# Pool Object that gets the list of videos to grab #
####################################################
def get_page_meta(x):
    meta_page_number, batch_id = x

    # Create our own stripper object
    strippy = VidyaStripper(strip_page_root_url)

    print("Getting page: ", meta_page_number)

    try:
        page_links = strippy.get_page_browse_links(meta_page_number)
    except PageBrowseInvalid as err:
        print("Page Number " + str(err.page_number) + " is invalid")
        # Without this return, page_links would be unbound below
        return "Page " + str(meta_page_number) + " was invalid"

    # print(page_links)
    for link in page_links:
        # print(" - " + link['video_page_url'])
        video_page_dat = strippy.get_video_info(link['video_page_url'])
        merged_dict = {**link, **video_page_dat}

        dby = VidyaDatabase(save_path)
        dby.change_video_meta(link['video_id'], batch_id, merged_dict)

    return "Updated page " + str(meta_page_number)


###################################
# Main entry point of the program #
###################################
if __name__ == '__main__':
    d = datetime.utcnow()
    batch_id = calendar.timegm(d.utctimetuple())
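    # calendar.timegm() of a UTC time tuple is the current Unix timestamp, so
    # every run is tagged with its start time in epoch seconds
    # (e.g. a run at 2021-02-08 00:00 UTC would use batch_id 1612742400)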

    start_page = 200
    do_pages = 400

    # Generate a list of stuff to work on
    datas = []

    for page_number in range(start_page, start_page + do_pages):
        datas.append((page_number, batch_id))

    # Create a pool of 40 worker processes, each fetching the metadata for one page
    with Pool(40) as p:
        p.map(get_page_meta, datas)