# Posted 7 years ago, Jan 29, 2019, 01:10 PM
#!/usr/bin/env python
# coding: utf-8
3
import io
import json
import os
import sys
import time

import requests
# Python 2.7 only: reload sys so that setdefaultencoding is available again
# and switch the implicit str/unicode conversion codec to UTF-8.
# Python 3 has neither reload() as a builtin nor sys.setdefaultencoding, so
# the hack is skipped there; on Python 3 the encoding is given explicitly
# when a file is opened instead, e.g.
#     f = open("file.csv", encoding='utf-8')
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 -- builtin on Python 2 only
    sys.setdefaultencoding('utf8')
16
class FacebookScraper(object):
    """Scrape a Facebook page feed, its likes and its comments.

    The class downloads feed pages from the Graph API (``get_feed_data``),
    flattens the downloaded JSON into lists of rows (``convert_*_data``) and
    writes those rows as quoted CSV files (``create_table``).
    """

    # Seconds to wait for the Graph API before a request is abandoned, so a
    # stalled connection cannot hang the scraper forever.
    REQUEST_TIMEOUT = 60

    # Column names per supported table; "page_name" is always the first column
    # and is filled in by create_table itself.
    _TABLE_HEADERS = {
        "feed": ["page_name", "id", "type", "created_time", "message", "name",
                 "description", "actions_link", "actions_name", "share_count",
                 "comment_count", "like_count"],
        "likes": ["page_name", "post_id", "user_id", "name"],
        "comments": ["page_name", "post_id", "created_time", "message",
                     "user_id", "name", "message_id"],
    }

    def __init__(self, token):
        """Store the Graph API access token used for feed requests."""
        self.token = token

    @staticmethod
    def convert_to_epochtime(date_string):
        """Convert a ``YYYY-MM-DD`` date string to an integer epoch time.

        The date is interpreted as local midnight (``time.mktime``).

        Raises:
            SystemExit: with status 1 when ``date_string`` does not match
                the ``%Y-%m-%d`` format.
        """
        try:
            return int(time.mktime(time.strptime(date_string, '%Y-%m-%d')))
        except ValueError:
            print('Invalid string format. Make sure to use %Y-%m-%d')
            # sys.exit instead of quit(): quit() only exists when the site
            # module is loaded and reports success (status 0) on this error.
            sys.exit(1)

    def _fetch_json(self, url, params=None):
        """GET ``url`` and return the decoded JSON body."""
        response = requests.get(url, params=params,
                                timeout=self.REQUEST_TIMEOUT)
        return json.loads(response.text)

    def _iter_edge_items(self, edge):
        """Yield every item of a paged edge, following its "next" links.

        ``edge`` is a dict holding an optional "data" list and an optional
        "paging" dict. A page without "data" (for example an error payload)
        contributes no items, and iteration stops at the first page that has
        no "next" link.
        """
        while True:
            for item in edge.get("data", []):
                yield item
            next_link = edge.get("paging", {}).get("next")
            if not next_link:
                return
            # Ask for bigger follow-up pages than the default of 25.
            edge = self._fetch_json(next_link.replace("limit=25", "limit=100"))

    @staticmethod
    def _dig(container, *path):
        """Return ``container[path[0]][path[1]]...`` or "" when unavailable."""
        current = container
        for step in path:
            try:
                current = current[step]
            except (KeyError, IndexError, TypeError):
                return ""
        return current

    @staticmethod
    def _quote_cell(value):
        """Render one CSV cell: quotes and line breaks removed, then quoted."""
        text = str(value).replace('"', '').replace('\n', '').replace('\r', '')
        return '"' + text + '"'

    def get_feed_data(self, target_page, offset, fields, json_path, date_string):
        """Download one page (up to 100 posts) of a page feed.

        Args:
            target_page: page name or id placed in the request URL.
            offset: number of posts to skip, sent as the "offset" parameter.
            fields: comma separated field list sent as "fields".
            json_path: file that receives the pretty-printed response.
            date_string: ``YYYY-MM-DD`` lower bound, sent as "since".

        Returns:
            The decoded JSON response. The HTTP status is not checked here,
            so the caller has to inspect the returned payload.
        """
        url = "https://graph.facebook.com/v2.10/{}/feed".format(target_page)
        params = {
            "access_token": self.token,
            "limit": "100",
            "offset": offset,
            "fields": fields,
            "since": self.convert_to_epochtime(date_string),
        }
        data = self._fetch_json(url, params)

        # The context manager closes the file even if the write fails.
        with open(json_path, "w") as dump_file:
            dump_file.write(json.dumps(data, indent=4))
        print("json file has been generated")

        return data

    def create_table(self, list_rows, file_path, page_name, table_name):
        """Write ``list_rows`` to ``file_path`` as a UTF-8 CSV table.

        Every row is prefixed with ``page_name``. All cells are wrapped in
        double quotes after removing embedded double quotes and line breaks,
        so each record occupies exactly one line.

        Args:
            list_rows: iterable of rows, each a sequence of cell values.
            file_path: destination file; overwritten if it exists.
            page_name: value of the leading "page_name" column.
            table_name: one of "feed", "likes" or "comments"; selects the
                header line.

        Raises:
            SystemExit: with status 1 when ``table_name`` is not supported.
        """
        header = self._TABLE_HEADERS.get(table_name)
        if header is None:
            print("Specified table name is not valid.")
            sys.exit(1)

        # io.open with an explicit encoding behaves the same on Python 2 and
        # Python 3; the u"" separators keep every written line a text string.
        with io.open(file_path, "w", encoding="utf-8") as csv_file:
            csv_file.write(u",".join(header) + u"\n")
            for row in list_rows:
                cells = [page_name] + list(row)
                line = u",".join(self._quote_cell(cell) for cell in cells)
                csv_file.write(line + u"\n")
        print("Generated {} table csv File for {}".format(table_name, page_name))

    def convert_feed_data(self, response_json_list):
        """Flatten feed responses into one row per post.

        Each row holds: id, type, created_time, message, name, description,
        actions_link, actions_name, share_count, comment_count, like_count.
        Any value missing from a post becomes "". A post without "id" raises
        KeyError.
        """
        list_all = []
        for response_json in response_json_list:
            for post in response_json["data"]:
                list_all.append([
                    post["id"],
                    self._dig(post, "type"),
                    self._dig(post, "created_time"),
                    self._dig(post, "message"),
                    self._dig(post, "name"),
                    self._dig(post, "description"),
                    # "actions" can be an empty list, hence the index-safe dig.
                    self._dig(post, "actions", 0, "link"),
                    self._dig(post, "actions", 0, "name"),
                    self._dig(post, "shares", "count"),
                    self._dig(post, "comments", "summary", "total_count"),
                    self._dig(post, "likes", "summary", "total_count"),
                ])
        return list_all

    def convert_likes_data(self, response_json_list):
        """List the people who liked each post.

        Returns rows of ``[post_id, user_id, name]`` which can be joined to
        the feed table by post id. Likes beyond the first page are downloaded
        through the "next" links. Names are kept as text, and a missing
        user id or name becomes "".
        """
        list_all = []
        for response_json in response_json_list:
            for post in response_json["data"]:
                post_id = post["id"]
                likes_edge = post.get("likes", {})
                for like in self._iter_edge_items(likes_edge):
                    list_all.append(
                        [post_id, like.get("id", ""), like.get("name", "")])
                # Progress is only reported for posts that needed extra pages.
                if likes_edge.get("paging", {}).get("next"):
                    print("Likes for the post {} completed".format(post_id))
        return list_all

    def convert_comments_data(self, response_json_list):
        """List the comments made on each post.

        Returns rows of ``[post_id, created_time, message, user_id, name,
        message_id]`` which can be joined to the feed table by post id.
        Comments beyond the first page are downloaded through the "next"
        links. Text is kept as text, and values missing from a comment
        (including a missing "from" author) become "".
        """
        list_all = []
        for response_json in response_json_list:
            for post in response_json["data"]:
                post_id = post["id"]
                comments_edge = post.get("comments", {})
                for comment in self._iter_edge_items(comments_edge):
                    author = comment.get("from") or {}
                    list_all.append([
                        post_id,
                        comment.get("created_time", ""),
                        comment.get("message", ""),
                        author.get("id", ""),
                        author.get("name", ""),
                        comment.get("id", ""),
                    ])
                # Progress is only reported for posts that needed extra pages.
                if comments_edge.get("paging", {}).get("next"):
                    print("Comments for the post {} completed".format(post_id))
        return list_all
252
if __name__ == "__main__":
    # Command line: seven positional arguments, in the order named below.
    if len(sys.argv) < 8:
        print("Usage: {} TOKEN TARGET_PAGE JSON_PATH CSV_FEED_PATH "
              "CSV_LIKES_PATH CSV_COMMENTS_PATH DATE_SINCE(YYYY-MM-DD)"
              .format(sys.argv[0]))
        sys.exit(1)

    (token_input, target_page_input, json_path_input, csv_feed_path_input,
     csv_likes_path_input, csv_comments_path_input,
     date_since_input) = sys.argv[1:8]

    # Input check. The access token is a secret, so it is deliberately not
    # echoed to the console.
    print(target_page_input)

    # Built with join so that no stray whitespace ends up inside the
    # "fields" request parameter.
    field_input = ",".join([
        "id", "created_time", "name", "message", "comments.summary(true)",
        "shares", "type", "published", "link", "likes.summary(true)",
        "actions", "place", "tags", "object_attachment", "targeting",
        "feed_targeting", "scheduled_publish_time", "backdated_time",
        "description",
    ])

    fb = FacebookScraper(token_input)

    # get_feed_data requests 100 posts per call; a shorter page means the
    # feed is exhausted.
    page_size = 100
    offset = 0
    json_list = []
    # Prefix only the file name with the offset so that a JSON path with a
    # directory component keeps pointing into that directory.
    json_dir, json_name = os.path.split(json_path_input)
    while True:
        path = os.path.join(json_dir, "{}_{}".format(offset, json_name))
        data = fb.get_feed_data(target_page_input, str(offset), field_input,
                                path, date_since_input)
        # A response without a "data" list is treated as a failed request.
        if "data" not in data:
            print("Error with get request.")
            sys.exit(1)
        json_list.append(data)
        if len(data["data"]) < page_size:
            print("End of loop for obtaining more than 100 feed records.")
            break
        offset += page_size

    feed_table_list = fb.convert_feed_data(json_list)
    likes_table_list = fb.convert_likes_data(json_list)
    comments_table_list = fb.convert_comments_data(json_list)

    # Record check: show the first row of every table that has one.
    for table_rows in (feed_table_list, likes_table_list, comments_table_list):
        if table_rows:
            print(table_rows[0])

    fb.create_table(feed_table_list, csv_feed_path_input, target_page_input, "feed")
    fb.create_table(likes_table_list, csv_likes_path_input, target_page_input, "likes")
    fb.create_table(comments_table_list, csv_comments_path_input, target_page_input, "comments")