· 7 years ago · Oct 23, 2018, 12:02 PM
1import sqlite3
2import json
3from datetime import datetime
4import sys
5
6#Choose the apropriate encoding
7reload(sys)
8sys.setdefaultencoding('UTF-8')
9
10#Get all data as a vector and then insert them all
11sqlTransaction = []
12FILE = "RC_2015-01"
13
14connection = sqlite3.connect('{}.db'.format(FILE))
15cursor = connection.cursor()
16
17#Create a SQL table where we insert data
18def createTable():
19 cursor.execute("""CREATE TABLE IF NOT EXISTS parentReply
20 (parentID TEXT PRIMARY KEY, commentID TEXT UNIQUE, parent TEXT, comment TEXT, subReddit TEXT, unix INT, score INT)""")
21
22#Format data in order to be tokenized
23def formatData(data):
24 data = data.replace("\n", " newlinechar ").replace("\r", " newlinechar ").replace('"', "'")
25 return data
26
27#For every comment we need to know it's parent body so we can know, for every answer, its question
28def findParentBody(pid):
29 try:
30 sql = "SELECT comment FROM parentReply WHERE commentID = '{}' LIMIT 1".format(pid)
31 cursor.execute(sql)
32 result = cursor.fetchone()
33
34 if result != None:
35 return result[0]
36 else: return False
37 except Exception as e:
38 print("findParentBody", str(e))
39 return False
40
41#Check if there is already an answer to the reply with score greater than the current one
42def findExistingScore(pid):
43 try:
44 sql = "SELECT score FROM parentReply WHERE parentID = '{}' LIMIT 1".format(pid)
45 cursor.execute(sql)
46 result = cursor.fetchone()
47
48 if result != None:
49 return result[0]
50 else:
51 return False
52 except Exception as e:
53 print("findExistingScore", str(e))
54 return False
55
56#If the comment is still available and a good candidate as answer
57def acceptable(data):
58 if len(data.split(' ')) > 50 or len(data) < 1 :
59 return False
60 elif len(data) > 1000:
61 return False
62 elif data == '[deleted]':
63 return False
64 elif data == '[removed]':
65 return False
66 else:
67 return True
68
69#We update the database and set the new value for the reply
70def insertReplyAndReplace(commentID, parentID, parentData, body, subReddit, createdUTC, score):
71 try:
72 sql = """UPDATE parentReply SET parentID = "{}", commentID = "{}", parent = "{}",
73 comment = "{}", subReddit = "{}", unix = {}, score = {} WHERE parentID = {};""".format(parentID, commentID, parentData, body, subReddit, int(createdUTC), score, parentID)
74 transactionBuild(sql)
75 except Exception as e:
76 print("insertReplyAndReplace", str(e))
77
78#We insert the reply and the question. We got a valid pair
79def insertReply(commentID, parentID, parentData, body, subReddit, createdUTC, score):
80 try:
81 sql = """INSERT INTO parentReply (parentID, commentID, parent,
82 comment, subReddit, unix, score) VALUES ("{}", "{}", "{}", "{}", "{}", {}, {});""".format(parentID, commentID, parentData, body, subReddit, int(createdUTC), score)
83 transactionBuild(sql)
84 except Exception as e:
85 print("insertReply", str(e))
86
87#We insert a new question
88def insertQuestion(commentID, parentID, body, subReddit, createdUTC, score):
89 try:
90 sql = """INSERT INTO parentReply (parentID, commentID, comment,
91 subReddit, unix, score) VALUES ("{}", "{}", "{}", "{}", {}, {});""".format(parentID, commentID, body, subReddit, int(createdUTC), score)
92 transactionBuild(sql)
93 except Exception as e:
94 print("insertQuestion", str(e))
95
96#Start executing commands after more so that we don't have to begin a transaction every time. This way we save a lot of time
97def transactionBuild(sql):
98 global sqlTransaction
99 sqlTransaction.append(sql)
100 if len(sqlTransaction) > 1000:
101 cursor.execute("BEGIN TRANSACTION")
102 for command in sqlTransaction:
103 try:
104 cursor.execute(command)
105 except Exception as e:
106 pass
107 connection.commit()
108 sqlTransaction = []
109
110if __name__ == '__main__':
111 createTable()
112 rows = 0
113 pairedRows = 0
114
115 with open(FILE, buffering=1000) as file:
116 for row in file:
117 rows += 1
118 row = json.loads(row)
119
120 #Extract data from json
121 parentID = row['parent_id'].split('_')[1]
122 commentID = row['id']
123 body = formatData(row['body'])
124 createdUTC = row['created_utc']
125 score = row['score']
126 subReddit = row['subreddit']
127 parentData = findParentBody(parentID)
128
129 #Insertion logic
130 if score >= 2:
131 commentScore = findExistingScore(parentID)
132 if commentScore:
133 if score > commentScore and acceptable(body):
134 #We have to replace the lower score comment with the better one
135 insertReplyAndReplace(commentID, parentID, parentData, body, subReddit, createdUTC, score)
136 else:
137 if acceptable(body):
138 if parentData:
139 #If the current comment is a reply and the question doest have any answer
140 insertReply(commentID, parentID, parentData, body, subReddit, createdUTC, score)
141 pairedRows += 1
142 else:
143 #If it's actually a question
144 insertQuestion(commentID, parentID, body, subReddit, createdUTC, score)
145
146 #See status sometimes
147 if rows % 1000 == 0:
148 print("Total rows read: {} , paired rows: {}".format(rows, pairedRows))
149
150 #After longer time, if we see that there are questions without reply, we just delete them cuz they are slowing the DB
151 if rows > 0:
152 if rows % 1000000 == 0:
153 print("Start cleaning up")
154 sql = "DELETE from parentReply WHERE parent is NULL"
155 cursor.execute(sql)
156 connection.commit()
157 cursor.execute("VACUUM")
158 connection.commit()