· 7 years ago · Nov 20, 2018, 03:14 AM
1{
2 "cells": [
3 {
4 "cell_type": "code",
5 "execution_count": null,
6 "metadata": {},
7 "outputs": [],
8 "source": [
9 "from collections import defaultdict\n",
10 "import logging\n",
11 "import matplotlib.pyplot \n",
12 "import nltk, string\n",
13 "import networkx as nx\n",
14 "import numpy\n",
15 "from pprint import pprint\n",
16 "import sqlite3\n",
17 "from sklearn.feature_extraction.text import TfidfVectorizer\n",
18 "import wikipediaapi\n",
19 "# from gensim import corpora, models, similarities\n"
20 ]
21 },
22 {
23 "cell_type": "code",
24 "execution_count": null,
25 "metadata": {},
26 "outputs": [],
27 "source": [
"# Default figure size (inches) applied to every matplotlib figure in this notebook.\n",
"matplotlib.rcParams.update({'figure.figsize': [50, 50]})\n",
"\n",
"# Timestamped INFO-level logging for the rest of the notebook.\n",
"logging.basicConfig(\n",
"    level=logging.INFO,\n",
"    format='%(asctime)s : %(levelname)s : %(message)s',\n",
")"
30 ]
31 },
32 {
33 "cell_type": "code",
34 "execution_count": null,
35 "metadata": {},
36 "outputs": [],
37 "source": [
"# Root Wikipedia category; passed to WikipediaDataset below as the crawl starting point.\n",
38 "category_name = \"Category:Medicine\""
39 ]
40 },
41 {
42 "cell_type": "code",
43 "execution_count": null,
44 "metadata": {},
45 "outputs": [],
46 "source": [
"# Fetch the 'punkt' tokenizer data used by nltk.word_tokenize in normalize().\n",
"nltk.download('punkt')\n",
"\n",
"# Translation table for str.translate(): every ASCII punctuation code point maps to None (deleted).\n",
"remove_punctuation_map = {ord(symbol): None for symbol in string.punctuation}\n",
"stemmer = nltk.stem.porter.PorterStemmer()\n",
51 "\n",
"def stem_tokens(tokens):\n",
"    \"\"\"Return a list with the Porter stem of each token, in the original order.\"\"\"\n",
"    return list(map(stemmer.stem, tokens))\n",
54 "\n",
"def normalize(text):\n",
"    \"\"\"Lowercase ``text``, remove punctuation, tokenize it and stem every token.\"\"\"\n",
"    cleaned = text.lower().translate(remove_punctuation_map)\n",
"    tokens = nltk.word_tokenize(cleaned)\n",
"    return stem_tokens(tokens)\n",
58 "\n",
"# Shared TF-IDF vectorizer; cosine_sim() re-fits it on every pair of documents.\n",
"vectorizer = TfidfVectorizer(\n",
"    stop_words='english',\n",
"    tokenizer=normalize,\n",
")\n",
60 "\n",
"def cosine_sim(text1, text2):\n",
"    \"\"\"\n",
"    Return the TF-IDF cosine similarity of two documents as a float.\n",
"\n",
"    The shared ``vectorizer`` is re-fitted on just these two texts, so the IDF\n",
"    weights reflect this pair only. TfidfVectorizer L2-normalises each row by\n",
"    default, so the dot product of the two rows is their cosine similarity.\n",
"\n",
"    Returns 0.0 when no term survives tokenisation and stop-word removal\n",
"    (scikit-learn raises ValueError for an empty vocabulary in that case).\n",
"    \"\"\"\n",
"    try:\n",
"        tfidf = vectorizer.fit_transform([text1, text2])\n",
"    except ValueError:\n",
"        # Nothing to compare: treat the pair as completely dissimilar.\n",
"        return 0.0\n",
"    # '@' is a matrix product for both sparse matrices and sparse arrays, and\n",
"    # .toarray() replaces the '.A' shorthand that newer SciPy releases removed.\n",
"    return float((tfidf @ tfidf.T).toarray()[0, 1])\n"
64 ]
65 },
66 {
67 "cell_type": "code",
68 "execution_count": null,
69 "metadata": {},
70 "outputs": [],
71 "source": [
72 "class WikipediaDataset(object):\n",
73 " \"\"\"\n",
74 " Reads data from Wikipedia for a given category\n",
75 " \"\"\"\n",
76 " def __init__(self, category_name, num_pages=500, db_filename = \"wikipedia.sqlite\"):\n",
77 " self.category_name = category_name\n",
78 " self.num_pages = num_pages\n",
79 " self.db_filename = db_filename\n",
80 " self.conn = sqlite3.connect(self.db_filename, timeout=5)\n",
81 " self.cursor = self.conn.cursor()\n",
82 " self.wiki = wikipediaapi.Wikipedia('en', extract_format=wikipediaapi.ExtractFormat.WIKI)\n",
83 " \n",
84 " def setup_db_tables(self):\n",
85 " try:\n",
86 " self.cursor.executescript(''' \n",
87 " CREATE TABLE CATEGORY(\n",
88 " id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,\n",
89 " topic TEXT UNIQUE\n",
90 " );''')\n",
91 " self.conn.commit()\n",
92 " except Exception as e:\n",
93 " # Add logging or error handling here\n",
94 " print(e)\n",
95 " \n",
96 " try:\n",
97 " self.cursor.executescript('''\n",
98 " CREATE TABLE EDGES (\n",
99 " id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,\n",
100 " toNode TEXT UNIQUE,\n",
101 " fromNode TEXT UNIQUE, \n",
102 " weight REAL DEFAULT 0.0 \n",
103 " );\n",
104 " ''')\n",
105 " self.conn.commit()\n",
106 " except Exception as e:\n",
107 " # Add logging or error handling here\n",
108 " print(e)\n",
109 "\n",
110 " try:\n",
111 " self.cursor.executescript('''\n",
112 " CREATE TABLE NODE (\n",
113 " id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,\n",
114 " nodeName TEXT UNIQUE,\n",
115 " content TEXT,\n",
116 " topic TEXT\n",
117 " );\n",
118 " ''')\n",
119 " self.conn.commit()\n",
120 " except Exception as e:\n",
121 " # Add logging or error handling here\n",
122 " print(e)\n",
123 " \n",
124 " def scrape_pages(self):\n",
125 " try:\n",
126 " category_name = self.category_name\n",
127 " num_pages = self.num_pages \n",
128 " cat_id = -1\n",
129 " self.cursor.execute('''INSERT OR IGNORE INTO CATEGORY (topic) VALUES(?)''', (category_name,))\n",
130 " self.conn.commit()\n",
131 " while category_name is not None:\n",
132 " cat = self.wiki.page(category_name)\n",
133 " for c in cat.categorymembers.values():\n",
134 " if c.ns == wikipediaapi.Namespace.MAIN:\n",
135 " print(num_pages)\n",
136 " # print(c.title)\n",
137 " self.cursor.execute('''SELECT nodeName FROM NODE WHERE nodeName = ? ''', (c.title,))\n",
138 " n = self.cursor.fetchone()\n",
139 " # print(n)\n",
140 " if n is None:\n",
141 " p_wiki = self.wiki.page(c.title)\n",
142 " print('*** adding page %s' % c.title)\n",
143 " # Save content to sqlite\n",
144 " self.cursor.execute('''INSERT OR IGNORE INTO NODE (nodeName, content, topic) VALUES(?, ?, ?)''', (c.title, p_wiki.text, category_name))\n",
145 " # Save links to sqlite \n",
146 " for to in p_wiki.links:\n",
147 " self.cursor.execute('''INSERT OR IGNORE INTO EDGES (toNode, fromNode) VALUES(?, ?)''', (to, c.title))\n",
148 " self.conn.commit()\n",
149 " num_pages = num_pages - 1\n",
150 " if num_pages < 1:\n",
151 " # self.conn.commit()\n",
152 " return\n",
153 " else:\n",
154 " print(\"Found '%s' in NODE table\" % (c.title))\n",
155 " elif c.ns == wikipediaapi.Namespace.CATEGORY:\n",
156 " print(c.title)\n",
157 " self.cursor.execute('''INSERT OR IGNORE INTO CATEGORY (topic) VALUES(?)''', (c.title,))\n",
158 " self.conn.commit()\n",
159 " print(\"cat_id is %d\" % (cat_id))\n",
160 " if cat_id == -1:\n",
161 " self.cursor.execute('SELECT min(id) FROM CATEGORY')\n",
162 " cat_id = self.cursor.fetchone()[0]\n",
163 " cat_id = cat_id + 1\n",
164 " self.cursor.execute('SELECT topic FROM CATEGORY WHERE id = ? ', (cat_id,))\n",
165 " row = self.cursor.fetchone()\n",
166 " if row is None:\n",
167 " category_name = None\n",
168 " else:\n",
169 " category_name = row[0]\n",
170 " self.conn.commit()\n",
171 " except KeyboardInterrupt:\n",
172 " # self.cursor.close()\n",
173 " # self.conn.close()\n",
174 " pass\n",
175 " except Exception as e:\n",
176 " # Add logging or error handling here\n",
177 " # print(\"Exception in scraping pages...\")\n",
178 " print(e)\n",
179 " \n",
180 " def drop_tables(self):\n",
181 " try:\n",
182 " self.cursor.executescript(''' \n",
183 " DROP TABLE IF EXISTS EDGES;\n",
184 " DROP TABLE IF EXISTS CATEGORY;\n",
185 " ''')\n",
186 " self.conn.commit()\n",
187 " except Exception as e:\n",
188 " # Add logging or error handling here\n",
189 " print(e)\n",
190 " \n",
191 " def close(self):\n",
192 " try:\n",
193 " self.cursor.close()\n",
194 " self.conn.close()\n",
195 " except Exception as e:\n",
196 " # Add logging or error handling here\n",
197 " print(e)\n",
198 " "
199 ]
200 },
201 {
202 "cell_type": "code",
203 "execution_count": null,
204 "metadata": {},
205 "outputs": [],
206 "source": [
"w = WikipediaDataset(category_name)\n",
"try:\n",
"    w.drop_tables()\n",
"    w.setup_db_tables()\n",
"    w.scrape_pages()\n",
"finally:\n",
"    # Always release the SQLite handle, even if a step above is interrupted.\n",
"    w.close()"
212 ]
213 },
214 {
215 "cell_type": "code",
216 "execution_count": null,
217 "metadata": {},
218 "outputs": [],
219 "source": []
220 },
221 {
222 "cell_type": "code",
223 "execution_count": null,
224 "metadata": {},
225 "outputs": [],
226 "source": [
227 "class SemanticWeights(object):\n",
228 " \"\"\"\n",
229 " Compute weights of edges for nodes\n",
230 " \"\"\"\n",
231 " def __init__(self, db_filename = \"wikipedia.sqlite\", max_nodes = 10):\n",
232 " self.db_filename = db_filename\n",
233 " self.conn = sqlite3.connect(self.db_filename, timeout=5)\n",
234 " self.cursor = self.conn.cursor()\n",
235 " self.max_nodes = max_nodes\n",
236 " \"\"\"\n",
237 " # gensim related\n",
238 " self.documents = [] \n",
239 " self.dictionary = None\n",
240 " self.corpus = None\n",
241 " self.lsi = None\n",
242 " \"\"\"\n",
243 " \n",
244 " def setup_lsi_model(self):\n",
245 " self.cursor.execute('''SELECT nodeName, content FROM NODE''')\n",
246 " for node in self.cursor.fetchall():\n",
247 " self.documents.append(node[1])\n",
248 " stoplist = set('for a of the and to in'.split())\n",
249 " texts = [[word for word in document.lower().split() if word not in stoplist]\n",
250 " for document in self.documents]\n",
251 " frequency = defaultdict(int)\n",
252 " for text in texts:\n",
253 " for token in text:\n",
254 " frequency[token] += 1\n",
255 " texts = [[token for token in text if frequency[token] > 1]\n",
256 " for text in texts]\n",
257 " pprint(texts)\n",
258 " self.dictionary = corpora.Dictionary(texts)\n",
259 " self.corpus = [self.dictionary.doc2bow(text) for text in texts]\n",
260 " corpora.MmCorpus.serialize('/tmp/wiki.mm', self.corpus)\n",
261 " self.lsi = models.LsiModel(self.corpus, id2word=self.dictionary, num_topics=2)\n",
262 " self.index = similarities.MatrixSimilarity(self.lsi[self.corpus])\n",
263 " \n",
264 " def update_weights(self):\n",
265 " self.cursor.execute('''SELECT nodeName, content FROM NODE''') \n",
266 " for fromNode in self.cursor.fetchall():\n",
267 " self.cursor.execute('''SELECT toNode FROM EDGES WHERE fromNode = ? ''', (fromNode[0],))\n",
268 " fromContent = fromNode[1]\n",
269 " num_nodes = self.max_nodes\n",
270 " for toNode in self.cursor.fetchall(): \n",
271 " self.cursor.execute('''SELECT content FROM NODE WHERE nodeName = ? ''', (toNode[0],))\n",
272 " n = self.cursor.fetchone()\n",
273 " semantic_distance = 0.0\n",
274 " if n is not None:\n",
275 " toContent = n[0]\n",
276 " semantic_distance = cosine_sim(fromContent, toContent)\n",
277 " print(\"Updating weights for edge between '%s' and '%s' to %f\" % (fromNode[0], toNode[0], semantic_distance))\n",
278 " # Can substitute with LSA (gensim) based cosine similarity or doc2vec based cosine similarity\n",
279 " \"\"\"\n",
280 " # gensim related\n",
281 " from_vec = self.dictionary.doc2bow(fromContent.lower().split())\n",
282 " from_vec_lsi = self.lsi[from_vec]\n",
283 " to_vec = self.dictionary.doc2bow(toContent.lower().split())\n",
284 " to_vec_lsi = self.lsi[to_vec]\n",
285 " \"\"\"\n",
286 " # Compute semantic distance\n",
287 " self.cursor.execute('''UPDATE EDGES set weight = ? WHERE fromNode = ? and toNode = ? ''', (semantic_distance, fromNode[0], toNode[0]))\n",
288 " self.conn.commit()\n",
289 " num_nodes = num_nodes - 1\n",
290 " else:\n",
291 " print(\"Weights not updated for edge between '%s' and '%s'\" % (fromNode[0], toNode[0]))\n",
292 " if num_nodes < 1:\n",
293 " break"
294 ]
295 },
296 {
297 "cell_type": "code",
298 "execution_count": null,
299 "metadata": {},
300 "outputs": [],
301 "source": [
"sem = SemanticWeights()\n",
"try:\n",
"    sem.update_weights()\n",
"finally:\n",
"    # Release the SQLite handle once the weights are written (or on failure).\n",
"    sem.conn.close()"
304 ]
305 },
306 {
307 "cell_type": "code",
308 "execution_count": null,
309 "metadata": {},
310 "outputs": [],
311 "source": []
312 },
313 {
314 "cell_type": "code",
315 "execution_count": null,
316 "metadata": {},
317 "outputs": [],
318 "source": [
319 "class NetworkGraph(object):\n",
320 " \"\"\"\n",
321 " Plots the network graph for downloaded Wikipedia data\n",
322 " \"\"\"\n",
323 " def __init__(self, db_filename = \"wikipedia.sqlite\", max_nodes = 10):\n",
324 " self.db_filename = db_filename\n",
325 " self.conn = sqlite3.connect(self.db_filename, timeout=5)\n",
326 " self.cursor = self.conn.cursor()\n",
327 " self.graph = nx.DiGraph() # nx.Graph()\n",
328 " self.max_nodes = max_nodes\n",
329 " \n",
330 " def build_network(self):\n",
331 " self.cursor.execute('''SELECT nodeName FROM NODE''')\n",
332 " for fromNode in self.cursor.fetchall():\n",
333 " self.cursor.execute('''SELECT toNode, weight FROM EDGES WHERE fromNode = ? ''', (fromNode[0],))\n",
334 " numnodes = self.max_nodes\n",
335 " for toNode in self.cursor.fetchall():\n",
336 " self.graph.add_edge(fromNode[0], toNode[0])\n",
337 " self.graph[fromNode[0]][toNode[0]]['weight'] = 10*toNode[1]\n",
338 " numnodes = numnodes - 1\n",
339 " if numnodes < 1:\n",
340 " break\n",
341 "\n",
342 " def plot_network(self, node_size = 20, node_color = 'b', edge_width = 1, \n",
343 " font_size = 8, font_color = 'g'): \n",
344 " G = nx.spring_layout(self.graph)\n",
345 " nx.draw_networkx_nodes(self.graph, G, node_size = node_size, \n",
346 " node_color = node_color)\n",
347 " edges = self.graph.edges()\n",
348 " weights = [(1.0+self.graph[u][v]['weight']) for u,v in edges]\n",
349 " nx.draw_networkx_edges(self.graph, G, width=weights) \n",
350 " nx.draw_networkx_labels(self.graph, G, font_size = font_size, \n",
351 " font_family='sans-serif', font_color=font_color)\n"
352 ]
353 },
354 {
355 "cell_type": "code",
356 "execution_count": null,
357 "metadata": {},
358 "outputs": [],
359 "source": [
"# Load the stored edges into a directed graph and draw it with the default styling.\n",
360 "network = NetworkGraph()\n",
361 "network.build_network()\n",
362 "network.plot_network()"
363 ]
364 },
365 {
366 "cell_type": "code",
367 "execution_count": null,
368 "metadata": {},
369 "outputs": [],
370 "source": [
"# NetworkGraph defines no plot_network0(); redraw with the method that exists.\n",
"network.plot_network()"
372 ]
373 },
374 {
375 "cell_type": "code",
376 "execution_count": null,
377 "metadata": {},
378 "outputs": [],
379 "source": []
380 }
381 ],
382 "metadata": {
383 "kernelspec": {
384 "display_name": "Python 3",
385 "language": "python",
386 "name": "python3"
387 },
388 "language_info": {
389 "codemirror_mode": {
390 "name": "ipython",
391 "version": 3
392 },
393 "file_extension": ".py",
394 "mimetype": "text/x-python",
395 "name": "python",
396 "nbconvert_exporter": "python",
397 "pygments_lexer": "ipython3",
398 "version": "3.7.1"
399 }
400 },
401 "nbformat": 4,
402 "nbformat_minor": 2
403}