cBtrkpXB

· 6 years ago · Jan 21, 2020, 06:30 PM
1import re
2import requests
3from time import *
4from collections import *
5
6subDict = defaultdict(int)
7pat = re.compile(r"\/?r\/ ?((\w|_){3,21})")
8unsorted = []
9
10#span of time to grab posts is 12 hours, starting from now
11timestep = 12*60*60
12before = int(time())
13
14sub = "JustUnsubbed"
15
16grabbing = True
17hits = 0
18while(grabbing):
19    #Adjust timeframe to be the 12 hours prior to "before" timestamp
20    after = before - timestep
21    #If the api has timed me out, wait a few seconds before querying again
22    while("Too Many Requests" in (r := requests.get(f"https://api.pushshift.io/reddit/search/submission/?subreddit={sub}&sort=desc&sort_type=created_utc&after={after}&before={before}&size=1000")).text):
23        print("Timed out")
24        sleep(5)
25    
26    posts = r.json()["data"]
27
28    if(len(posts) == 0):
29        grabbing = False
30        break
31
32    for post in posts:
33        title = post["title"].lower()
34        regexResult = pat.search(title)
35
36        #If there is a subreddit (prefixed by r/, with some optional syntax sugar to catch as many subs as possible)
37        if(regexResult):
38            #Print the grabbed subreddit name, because words scrolling by on a console is cool B)
39            print(regexResult.group(0))
40
41            #Ignore r/justunsubbed because a significant number of people post "I just r/unsubbed from r/<subreddit>", causing error in the data
42            if(regexResult.group(1) != 'justunsubbed'):
43                subDict[regexResult.group(1)] += 1
44            hits += 1
45        else:
46            #Otherwise, the sub can't be immediately parsed. Log it
47            print(f"MISSED: {title}")
48            unsorted.append(title)
49
50        print("----------------------")
51
52    #Set the end of the timeframe to the oldest post in the list
53    before = posts[-1]["created_utc"]
54
55
56print(f"got {hits} matches and {len(unsorted)} misses")
57
58#Compose the dict based on the edit distance of a given key to it's neigbors
59delete = []
60for referenceKey in subDict.keys():
61    if(referenceKey not in delete):
62        #Grab all close keys that haven't already been composed and aren't the referencekey
63        matches = get_close_matches(referenceKey,[key for key in subDict.keys() if ((key != referenceKey) and (key not in delete))],cutoff=.8)
64        print(matches)
65        for match in matches:
66            answer = input(f"Is {match} probably {referenceKey}?")
67            #If any input is grabbed, mark the matched key for deletion and compose it into the reference
68            if(answer != ""):
69                delete.append(match)
70                subDict[referenceKey] += subDict[match]
71
72#Build a new dict without the deleted items and replace it
73newDict = {}
74for k,v in subDict.items():
75    if(k not in delete):
76        newDict[k] = v
77subDict = newDict
78
79#Sort the dict and print it to a csv for excel
80with open("out.csv", "w") as f:
81    for k in sorted(subDict, key=subDict.get, reverse=True):
82        f.write(f"{k},{subDict[k]}\n")
83
84#Print out all of the failures
85with open("fails.txt", "w", encoding="utf8") as f:
86    f.writelines([l + '\n' for l in unsorted])