# Pasted snapshot — Jan 21, 2020, 06:30 PM
import re
from collections import *
from difflib import get_close_matches
from time import *

import requests
5
# Running tally: subreddit name -> number of times it was mentioned
subDict = defaultdict(int)
# Matches "r/name", "/r/name", or "r/ name"; reddit names are 3-21 word chars.
# NOTE: the previous pattern used ((\w|_){3,21}); \w already includes "_", so
# the alternation was redundant and risked heavy backtracking on near-misses.
# group(1) still captures the bare subreddit name, as before.
pat = re.compile(r"/?r/ ?(\w{3,21})")
# Titles where no subreddit could be extracted
unsorted = []

# Span of time to grab posts is 12 hours, walking backwards from now
timestep = 12 * 60 * 60
before = int(time())

# Subreddit whose submissions are scraped
sub = "JustUnsubbed"

grabbing = True  # main scrape-loop flag
hits = 0         # titles successfully parsed
18while(grabbing):
19 #Adjust timeframe to be the 12 hours prior to "before" timestamp
20 after = before - timestep
21 #If the api has timed me out, wait a few seconds before querying again
22 while("Too Many Requests" in (r := requests.get(f"https://api.pushshift.io/reddit/search/submission/?subreddit={sub}&sort=desc&sort_type=created_utc&after={after}&before={before}&size=1000")).text):
23 print("Timed out")
24 sleep(5)
25
26 posts = r.json()["data"]
27
28 if(len(posts) == 0):
29 grabbing = False
30 break
31
32 for post in posts:
33 title = post["title"].lower()
34 regexResult = pat.search(title)
35
36 #If there is a subreddit (prefixed by r/, with some optional syntax sugar to catch as many subs as possible)
37 if(regexResult):
38 #Print the grabbed subreddit name, because words scrolling by on a console is cool B)
39 print(regexResult.group(0))
40
41 #Ignore r/justunsubbed because a significant number of people post "I just r/unsubbed from r/<subreddit>", causing error in the data
42 if(regexResult.group(1) != 'justunsubbed'):
43 subDict[regexResult.group(1)] += 1
44 hits += 1
45 else:
46 #Otherwise, the sub can't be immediately parsed. Log it
47 print(f"MISSED: {title}")
48 unsorted.append(title)
49
50 print("----------------------")
51
52 #Set the end of the timeframe to the oldest post in the list
53 before = posts[-1]["created_utc"]
54
55
56print(f"got {hits} matches and {len(unsorted)} misses")
57
# Compose the dict based on the edit distance of a given key to its
# neighbors: propose close matches and let the user confirm each merge
# interactively (any non-empty input means "yes").
delete = set()  # keys confirmed as duplicates; dropped from the dict below
for referenceKey in subDict.keys():
    if referenceKey in delete:
        continue
    # Grab all close keys that haven't already been composed and aren't the
    # reference key itself (set membership keeps this O(1) per check)
    candidates = [key for key in subDict.keys()
                  if key != referenceKey and key not in delete]
    matches = get_close_matches(referenceKey, candidates, cutoff=.8)
    print(matches)
    for match in matches:
        answer = input(f"Is {match} probably {referenceKey}?")
        # If any input is grabbed, mark the matched key for deletion and
        # compose its count into the reference key
        if answer != "":
            delete.add(match)
            subDict[referenceKey] += subDict[match]
71
# Replace the tally with a copy that omits every key merged away above
subDict = {name: count for name, count in subDict.items() if name not in delete}
78
# Sort the tally by count, most-mentioned first, and dump it as CSV for excel
with open("out.csv", "w") as f:
    ranked = sorted(subDict.items(), key=lambda item: item[1], reverse=True)
    for name, count in ranked:
        f.write(f"{name},{count}\n")
83
# Record every title that couldn't be parsed, one per line
with open("fails.txt", "w", encoding="utf8") as f:
    for title in unsorted:
        f.write(title + "\n")