KH6Rgf9W

· 6 years ago · Sep 04, 2019, 05:16 PM
1{
2 "cells": [
3  {
4   "cell_type": "markdown",
5   "metadata": {},
6   "source": [
7    "# 1.1: Pull tweets"
8   ]
9  },
10  {
11   "cell_type": "markdown",
12   "metadata": {},
13   "source": [
14    "1. Pull tweets based on hashtag and timestamp as the training set. \n",
15    "2. Pull tweets based on coordinates and hashtag as the testing set.  \n",
16    "(Please add your keys, label, and number of tweets as described in the comments below.)"
17   ]
18  },
19  {
20   "cell_type": "code",
21   "execution_count": 10,
22   "metadata": {},
23   "outputs": [],
24   "source": [
25    "from TwitterAPI import TwitterAPI\n",
26    "import json\n",
27    "\n",
28    "import pandas as pd\n",
29    "import time"
30   ]
31  },
32  {
33   "cell_type": "code",
34   "execution_count": 4,
35   "metadata": {},
36   "outputs": [],
37   "source": [
38    "# put twitter keys here\n",
39    "consumer_key=''\n",
40    "consumer_secret=''\n",
41    "access_token_key=''\n",
42    "access_token_secret=''"
43   ]
44  },
45  {
46   "cell_type": "code",
47   "execution_count": 1,
48   "metadata": {},
49   "outputs": [],
50   "source": [
51    "# put params here\n",
52    "\n",
53    "# pull tweets - #campfire / #carrfire / #hurricaneharvey\n",
54    "# params = {\n",
55    "#     'query':'lang:en (#campfire)',\n",
56    "#     'fromDate':'201811080000',\n",
57    "#     'toDate':'201911090000'\n",
58    "# }\n",
59    "\n",
60    "# params = {\n",
61    "#     'query':'lang:en (#carrfire)',\n",
62    "#     'fromDate':'201807230000',\n",
63    "#     'toDate':'201807240000'\n",
64    "# }\n",
65    "\n",
66    "# params = {\n",
67    "#     'query':'lang:en (#hurricaneharvey)',\n",
68    "#     'fromDate':'201708170000',\n",
69    "#     'toDate':'201709020000'\n",
70    "# }\n",
71    "\n",
72    "\n",
73    "# criteria example for the testing set:\n",
74    "# 'lang:en point_radius:[-95.36 29.76 25mi] (#carrfire)'"
75   ]
76  },
77  {
78   "cell_type": "code",
79   "execution_count": 6,
80   "metadata": {},
81   "outputs": [],
82   "source": [
83    "# functions used to call the API multiple times to pull tweets\n",
84    "\n",
85    "# Build the API client from the credentials entered in the keys cell above\n",
86    "api = TwitterAPI(consumer_key=consumer_key, consumer_secret=consumer_secret,\n",
87    "                 access_token_key=access_token_key,\n",
88    "                 access_token_secret=access_token_secret)\n",
89    "\n",
90    "def _get_tweet_count(found_tweets):\n",
91    "    return len({tweet['id'] for tweet in found_tweets})  # count of distinct tweet ids\n",
92    "\n",
93    "def search(api, params, next_key=None):\n",
94    "    '''Send one full-archive search request; return (results, next_token) or (None, None) on an error payload.'''\n",
95    "    PRODUCT = 'fullarchive'\n",
96    "    LABEL = '' # This is specific to your application\n",
97    "               # i.e. whatever label you set for your Dev environment, and is case sensitive\n",
98    "    # Work on a copy so the caller's dict never keeps a stale 'next' token between pulls.\n",
99    "    request_params = dict(params)\n",
100    "    if next_key:\n",
101    "        request_params['next'] = next_key\n",
102    "    r = api.request('tweets/search/%s/:%s' % (PRODUCT, LABEL), request_params)\n",
103    "    payload = r.json()  # decode the body once instead of on every access\n",
104    "    print(payload.keys())\n",
105    "    if 'results' not in payload:\n",
106    "        # No 'results' key: print the payload so the failure is visible, then signal it.\n",
107    "        print(payload)\n",
108    "        return None, None\n",
109    "    # A response without 'next' yields None, which search_tweets treats as the last page.\n",
110    "    return payload['results'], payload.get('next')\n",
111    "\n",
112    "def search_tweets(api, params, counts):\n",
113    "    '''Call search() page by page, appending to the global `results`, until `counts` tweets, the request cap, an error or the last page; return the list.'''\n",
114    "    global results  # module-level list extended in place; the caller resets it before each new pull\n",
115    "    MAX_COUNTER = 50 # number of times running api\n",
116    "    query_counter = tweet_counter = 0\n",
117    "    next_key = ''\n",
118    "    while len(results) < counts and query_counter < MAX_COUNTER:  # '<' stops after exactly MAX_COUNTER calls\n",
119    "        result, next_key = search(api, params, next_key)\n",
120    "        query_counter += 1\n",
121    "        if result is None:  # search() reported an error payload; stop instead of crashing on len(None)\n",
122    "            break\n",
123    "        print('got %s results' % str(len(result)))\n",
124    "        print('Next: %s' % next_key)\n",
125    "        results.extend(result)\n",
126    "        unique_count = _get_tweet_count(results)\n",
127    "        if unique_count == tweet_counter or next_key is None:  # nothing new arrived, or no further page\n",
128    "            break\n",
129    "        tweet_counter = unique_count\n",
130    "        time.sleep(2)\n",
131    "    return results"
132   ]
133  },
134  {
135   "cell_type": "code",
136   "execution_count": 11,
137   "metadata": {
138    "scrolled": false
139   },
140   "outputs": [],
141   "source": [
142    "# use above functions\n",
143    "# 4000 is the number of tweets we want to collect\n",
144    "results = []\n",
145    "test_results = search_tweets(api, params, 4000)"
146   ]
147  },
148  {
149   "cell_type": "code",
150   "execution_count": 362,
151   "metadata": {},
152   "outputs": [],
153   "source": [
154    "# save test_results in json format\n",
155    "# with open('./data/campfire_tweets.json', 'w') as f:\n",
156    "#     json.dump(test_results, f)"
157   ]
158  },
159  {
160   "cell_type": "code",
161   "execution_count": 363,
162   "metadata": {},
163   "outputs": [],
164   "source": [
165    "# Info pulled from tweets\n",
166    "USEFUL_INFO_KEYS = ['created_at', 'text', 'longitute', 'latitute', 'retweet_count', 'favorite_count', 'hashtags']\n",
167    "\n",
168    "def extract_tweets(tweets):\n",
169    "    '''Reduce each raw tweet dict to the fields named in USEFUL_INFO_KEYS, in that order.\n",
170    "\n",
171    "    A coordinate field is left out of a row when its lookup raises TypeError.'''\n",
172    "    # position of each coordinate field inside tweet['coordinates']['coordinates']\n",
173    "    coordinate_index = {'longitute': 0, 'latitute': 1}\n",
174    "    rows = []\n",
175    "    for tweet in tweets:\n",
176    "        row = {}\n",
177    "        for field in USEFUL_INFO_KEYS:\n",
178    "            if field in coordinate_index:\n",
179    "                try:\n",
180    "                    row[field] = tweet['coordinates']['coordinates'][coordinate_index[field]]\n",
181    "                except TypeError:\n",
182    "                    continue\n",
183    "            elif field == 'hashtags':\n",
184    "                row[field] = [tag['text'] for tag in tweet['entities']['hashtags']]\n",
185    "            else:\n",
186    "                row[field] = tweet[field]\n",
187    "        rows.append(row)\n",
188    "    return rows"
189   ]
190  },
191  {
192   "cell_type": "code",
193   "execution_count": 364,
194   "metadata": {},
195   "outputs": [],
196   "source": [
197    "# convert results into dataframe\n",
198    "extracted_test = extract_tweets(test_results)\n",
199    "df_campfire = pd.DataFrame(extracted_test)"
200   ]
201  },
202  {
203   "cell_type": "code",
204   "execution_count": 2,
205   "metadata": {},
206   "outputs": [],
207   "source": [
208    "# save dataframe as csv\n",
209    "# change the filename whenever you pull with different params\n",
210    "df_campfire.to_csv('./data/campfire_tweets.csv',index=False)"
211   ]
212  },
213  {
214   "cell_type": "code",
215   "execution_count": null,
216   "metadata": {},
217   "outputs": [],
218   "source": []
219  }
220 ],
221 "metadata": {
222  "kernelspec": {
223   "display_name": "Python 3",
224   "language": "python",
225   "name": "python3"
226  },
227  "language_info": {
228   "codemirror_mode": {
229    "name": "ipython",
230    "version": 3
231   },
232   "file_extension": ".py",
233   "mimetype": "text/x-python",
234   "name": "python",
235   "nbconvert_exporter": "python",
236   "pygments_lexer": "ipython3",
237   "version": "3.7.3"
238  }
239 },
240 "nbformat": 4,
241 "nbformat_minor": 2
242}