Dec 25, 2020, 03:22 AM
#!/usr/bin/env python3

import os, bs4
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
class WebBackup:

    # Shared class-level state: one browser, one cache, one URL queue.
    options = webdriver.ChromeOptions()
    driver = None
    cache_dir = None
    queue = set()
    def fetch_webdriver(self, url, filename):
        """Uses Selenium WebDriver to fetch a login-restricted Voat URL, with local caching."""
        # Return from the local cache if present; create the cache directory if needed.
        if not self.cache_dir:
            self.cache_dir = Path.home() / '.cache/voat'
            self.cache_dir.mkdir(parents=True, exist_ok=True)
        filename = self.cache_dir / filename
        if filename.is_file():
            #print("Cached", filename)
            return filename.read_text()
        # Init the WebDriver the first time it's needed:
        if not self.driver:
            self.driver = webdriver.Chrome(options=self.options)
            input("Log into Voat, then press Enter to continue...")
        # Fetch the URL via WebDriver, wait for the page to load, cache and return the HTML:
        print("Fetching", url)
        self.driver.get(url)
        WebDriverWait(self.driver, 15).until(
            lambda d: d.execute_script('return document.readyState') == 'complete')
        html = self.driver.page_source
        filename.write_text(html)
        return html
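
    # Usage sketch (illustrative, not from the original): the first call drives
    # the browser and caches the page under ~/.cache/voat/; later calls with the
    # same filename are served from that cache without touching the network.
    #   wb = WebBackup()
    #   html = wb.fetch_webdriver('https://voat.co/v/whatever', 'whatever.html')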


class VoatBackup(WebBackup):

    def queue_user_submissions(self, user):
        """Queues the comments-page URL of every submission by the given user."""
        page = 0
        while True:
            url = 'https://voat.co/u/%s/submissions?page=%d' % (user, page)
            filename = '%s-submissions-%d.html' % (user, page)
            html = self.fetch_webdriver(url, filename)
            soup = bs4.BeautifulSoup(html, features='lxml')
            links = soup.find_all("a", class_="comments")
            for link in links:
                href = 'https://voat.co' + link['href']
                self.queue.add(href)
            # Voat paginates 25 items per page; a short page is the last one.
            if len(links) != 25: break
            page += 1
    def queue_user_comments(self, user):
        """Queues thread URLs for every comment by the given user."""
        page = 0
        while True:
            url = 'https://voat.co/u/%s/comments?page=%d' % (user, page)
            filename = '%s-comments-%d.html' % (user, page)
            html = self.fetch_webdriver(url, filename)
            soup = bs4.BeautifulSoup(html, features='lxml')
            links = soup.find_all("a", string="permalink")
            for link in links:
                href = 'https://voat.co' + link['href']
                # Queue both the deep-linked comment thread and its parent submission:
                self.queue.add(href + '/1000000')
                self.queue.add('/'.join(href.split('/')[:-1]))
            if len(links) != 25: break
            page += 1
        print("Have %d URLs queued!" % len(self.queue))

    def save_queue(self):
        # TODO: automate calls to these external tools (all Python-based)...
        filename = self.cache_dir / 'queue'
        filename.write_text('\n'.join(self.queue))
        print('Wrote URL list to %s, but gotta run the tools manually...' % filename)
        #
        # STEP 1 - local WARC dump via https://github.com/ArchiveTeam/grab-site
        # (follow pyenv instructions). Sorta works, but skips 500 errs...
        # os.system('grab-site --1 --input-file %s' % filename)
        #
        # STEP 2 - `pip install -U archiveis` - their API doesn't work right now...
        # for url in self.queue: os.system('archiveis ' + url)
        #
        # STEP 3 - `pip install -U wayback-machine-archiver` - kinda works, but
        # must keep trying to fill error gaps, voat.co goes up and down...
        # os.system('archiver --file %s --log INFO --rate-limit-wait 0' % filename)
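
    # A sketch of automating STEP 3 (assumptions: the retry loop, max_tries, and
    # treating a nonzero exit status as failure are mine, not the original's;
    # the `archiver` flags are copied from the comment above). Keeps re-running
    # the Wayback archiver to fill error gaps while voat.co goes up and down.
    def run_wayback_archiver(self, max_tries=10):
        filename = self.cache_dir / 'queue'
        for attempt in range(1, max_tries + 1):
            status = os.system('archiver --file %s --log INFO --rate-limit-wait 0'
                               % filename)
            if status == 0:
                print("Archiver finished cleanly on attempt %d." % attempt)
                return
            print("Archiver exited with status %d; retrying..." % status)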


if __name__ == '__main__':
    vb = VoatBackup()
    vb.options.binary_location = '/bin/brave'
    # I'm "off the grid", so I gotta do this through an SSH proxy to a mobile phone (termux):
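    # (Illustrative, not from the original: a dynamic forward like
    #   ssh -D 8123 user@phone
    # exposes a SOCKS proxy on 127.0.0.1:8123; user/host here are hypothetical.)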
    vb.options.add_argument('--proxy-server=socks5://127.0.0.1:8123')
    vb.queue_user_submissions('libman')
    vb.queue_user_comments('libman')
    vb.save_queue()