Dec 25, 2020, 03:22 AM
#!/usr/bin/env python3

import os, bs4
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
class WebBackup:

    # Shared class-level state: one browser, one cache, one URL queue.
    options = webdriver.ChromeOptions()
    driver = None
    cache_dir = None
    queue = set()
    def fetch_webdriver(self, url, filename):
        """Uses Selenium WebDriver to fetch a login-restricted Voat URL, with local caching."""
        # Return from the local cache if present; create the cache directory if needed.
        if not self.cache_dir:
            self.cache_dir = Path.home() / '.cache/voat'
            self.cache_dir.mkdir(parents=True, exist_ok=True)
        filename = self.cache_dir / filename
        if filename.is_file():
            #print("Cached", filename)
            return filename.read_text()
        # Init the WebDriver the first time it's needed:
        if not self.driver:
            self.driver = webdriver.Chrome(options=self.options)
            input("Log into Voat, then press Enter to continue...")
        # Fetch the URL via WebDriver, wait for the page to load, cache and return the HTML:
        print("Fetching", url)
        self.driver.get(url)
        WebDriverWait(self.driver, 15).until(
            lambda d: d.execute_script('return document.readyState') == 'complete')
        html = self.driver.page_source
        filename.write_text(html)
        return html
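
    # Usage sketch (illustrative, not from the original): the first call drives
    # the browser and caches the page under ~/.cache/voat/; later calls with the
    # same filename are served from that cache without touching the network.
    #   wb = WebBackup()
    #   html = wb.fetch_webdriver('https://voat.co/v/whatever', 'whatever.html')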


class VoatBackup(WebBackup):

    def queue_user_submissions(self, user):
        """Queues the comments-page URL of every submission by the given user."""
        page = 0
        while True:
            url = 'https://voat.co/u/%s/submissions?page=%d' % (user, page)
            filename = '%s-submissions-%d.html' % (user, page)
            html = self.fetch_webdriver(url, filename)
            soup = bs4.BeautifulSoup(html, features='lxml')
            links = soup.find_all("a", class_="comments")
            for link in links:
                href = 'https://voat.co' + link['href']
                self.queue.add(href)
            # Voat paginates 25 items per page; a short page is the last one.
            if len(links) != 25: break
            page += 1
    def queue_user_comments(self, user):
        """Queues thread URLs for every comment by the given user."""
        page = 0
        while True:
            url = 'https://voat.co/u/%s/comments?page=%d' % (user, page)
            filename = '%s-comments-%d.html' % (user, page)
            html = self.fetch_webdriver(url, filename)
            soup = bs4.BeautifulSoup(html, features='lxml')
            links = soup.find_all("a", string="permalink")
            for link in links:
                href = 'https://voat.co' + link['href']
                # Queue both the deep-linked comment thread and its parent submission:
                self.queue.add(href + '/1000000')
                self.queue.add('/'.join(href.split('/')[:-1]))
            if len(links) != 25: break
            page += 1
        print("Have %d URLs queued!" % len(self.queue))

    def save_queue(self):
        # TODO: automate calls to these external tools (all Python-based)...
        filename = self.cache_dir / 'queue'
        filename.write_text('\n'.join(self.queue))
        print('Wrote URL list to %s, but gotta run the tools manually...' % filename)
        #
        # STEP 1 - local WARC dump via https://github.com/ArchiveTeam/grab-site
        # (follow pyenv instructions). Sorta works, but skips 500 errs...
        # os.system('grab-site --1 --input-file %s' % filename)
        #
        # STEP 2 - `pip install -U archiveis` - their API doesn't work right now...
        # for url in self.queue: os.system('archiveis ' + url)
        #
        # STEP 3 - `pip install -U wayback-machine-archiver` - kinda works, but
        # must keep trying to fill error gaps, voat.co goes up and down...
        # os.system('archiver --file %s --log INFO --rate-limit-wait 0' % filename)
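
    # A sketch of automating STEP 3 (assumptions: the retry loop, max_tries, and
    # treating a nonzero exit status as failure are mine, not the original's;
    # the `archiver` flags are copied from the comment above). Keeps re-running
    # the Wayback archiver to fill error gaps while voat.co goes up and down.
    def run_wayback_archiver(self, max_tries=10):
        filename = self.cache_dir / 'queue'
        for attempt in range(1, max_tries + 1):
            status = os.system('archiver --file %s --log INFO --rate-limit-wait 0'
                               % filename)
            if status == 0:
                print("Archiver finished cleanly on attempt %d." % attempt)
                return
            print("Archiver exited with status %d; retrying..." % status)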


if __name__ == '__main__':
    vb = VoatBackup()
    vb.options.binary_location = '/bin/brave'
    # I'm "off the grid", so I gotta do this through an SSH proxy to a mobile phone (termux):
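    # (Illustrative, not from the original: a dynamic forward like
    #   ssh -D 8123 user@phone
    # exposes a SOCKS proxy on 127.0.0.1:8123; user/host here are hypothetical.)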
    vb.options.add_argument('--proxy-server=socks5://127.0.0.1:8123')
    vb.queue_user_submissions('libman')
    vb.queue_user_comments('libman')
    vb.save_queue()