· 6 years ago · May 27, 2019, 12:26 PM
1# ----- CONFIGURE YOUR EDITOR TO USE 4 SPACES PER TAB ----- #
2import sys, os
3sys.path.append(os.path.join(os.path.split(os.path.abspath(__file__))[0], 'lib'))
4import pymysql as db
5import settings
6
7
8def connection():
9 ''' User this function to create your connections '''
10 con = db.connect(
11 settings.mysql_host,
12 settings.mysql_user,
13 settings.mysql_passwd,
14 settings.mysql_schema)
15
16 return con
17
18def sortFourth(val):
19 return val[3]
20
21def Classify_Create_Final(topn,category_list):
22 final=[]
23 # Add table header
24 tmpt=("title","class","subclass", "weightsum")
25 final.append(tmpt)
26
27 # For each article list containing the first-topn categories of the article
28 for article_block in category_list:
29 # Add all the category data at first and then if there are blanks fill spaces so that for every article 7 rows are displayed (optional)
30 for i in range(0,len(article_block)):
31 tmpt=(article_block[i][0],article_block[i][1],article_block[i][2],article_block[i][3])
32 final.append(tmpt)
33 # Add (topn-category amount) blank values
34 for i in range(len(article_block),topn):
35 tmpt=("----","----","----","----")
36 final.append(tmpt)
37
38
39
40
41 return final
42
43
44def classify(topn):
45
46 # Create a new connection
47 con=connection()
48
49 # Create a cursor on the connection
50 cur=con.cursor()
51
52 # Query that returns the title and the summary for every article without a category (the first is with join , the second without join)
53 # que="SELECT art.title,art.summary FROM articles art LEFT JOIN article_has_class ahc ON (ahc.articles_id=art.id) WHERE (ahc.articles_id IS NULL)"
54 que="SELECT title , summary FROM articles WHERE articles.id NOT IN (SELECT articles.id FROM articles , article_has_class WHERE (article_has_class.articles_id=articles.id))"
55 cur.execute(que)
56
57 data=cur.fetchall()
58
59 art_category=[]
60
61 # For each article that is returned by the query , take the summary(sum) and split it into single words
62 for article_row in data:
63 category_list=[]
64 title=article_row[0]
65 sum=article_row[1]
66 words=sum.split()
67 # For each word run the following query which returns all categories that are linked with it (and their weights)
68 for w in words:
69 que="SELECT * FROM classes WHERE (term=%s)"
70 input=(w)
71 row_c=cur.execute(que,input)
72
73 if row_c > 0:
74 category_res=cur.fetchall()
75 # For each category returned by the query prepare the data and create a temporary list
76 for category_row in category_res:
77 class1=category_row[0]
78 subclass=category_row[1]
79 term=category_row[2]
80 weight=float(category_row[3])
81 tmpt=[title,class1,subclass,weight]
82
83 # Search if the temporary list's category already exists in the general array
84 found=False
85 for tmpcat in category_list:
86 cl=tmpcat[1]
87 subcl=tmpcat[2]
88 # If it exists , simply add the weight of the current temporary list to the one that is inside the general array
89 if cl==class1 and subcl==subclass:
90 tmpcat[3]=weight+tmpcat[3]
91 found=True
92 break
93
94 # If it doesn't exist , insert a new category
95 if found==False:
96 category_list.append(tmpt)
97
98 # After all the insertions , sort the general array by the 'weightsum' in descending order
99 category_list.sort(key = sortFourth , reverse=True)
100 # Slice the general array into its first 'topn' elements and insert it into the (almost) final 3D matrix
101 art_category.append(category_list[:int(topn)])
102
103
104
105 final_tuple=Classify_Create_Final(int(topn),art_category)
106 return final_tuple
107
108
109def updateweight(class1,subclass,weight):
110
111 # Create a new connection
112
113 con=connection()
114 w1=int(weight)
115 # Create a cursor on the connection
116 cur=con.cursor()
117
118 que= "SELECT * FROM classes WHERE class = %s AND subclass=%s and weight > %d " % (class1,subclass,weight)
119 cur.execute(que,input)
120 data=cur.fetchall()
121 #
122 if data > 0 :
123 tmpw=(class_row[3]-weight)/2
124 class_row[3]=class_row[3]-tmpw
125 return [("result",), "ok"]
126 else:
127 return [("result",), "error"]
128
129
130
131def selectTopNClasses(fromdate, todate, n):
132
133 # Create a new connection
134
135 con=connection()
136
137 # Create a cursor on the connection
138 cur=con.cursor()
139 test= int(n)
140
141 que=""" SELECT article_has_class.class , article_has_class.subclass, count(article_has_class.class)
142 FROM article_has_class , articles
143 WHERE articles.id=article_has_class.articles_id IN (
144 SELECT articles.id
145 FROM articles , article_has_class , classes
146 WHERE ( classes.class = article_has_class.class AND
147 classes.subclass = article_has_class.subclass AND
148 article_has_class.articles_id = articles.id AND
149 articles.date > %s AND articles.date < %s)
150 )
151 GROUP BY article_has_class.class
152 ORDER BY count(class) DESC
153 LIMIT %d"""
154
155 input=(fromdate , todate , n )
156 cur.execute(que,input)
157 answer=cur.fetchall()
158 return [("class","subclass", "count"),answer]
159
160def countArticles(class1,subclass):
161 # Create a new connection
162 con=connection()
163 # Create a cursor on the connection
164 cur=con.cursor()
165
166 query="""SELECT COUNT(*)
167 FROM (SELECT art.title
168 from classes c
169 inner join article_has_class ahc on (ahc.class=c.class AND ahc.subclass=c.subclass)
170 inner join articles art on (art.id=ahc.articles_id)
171 where (c.class=%s AND c.subclass=%s)
172 group by art.id) as t1"""
173
174 input=(class1,subclass)
175
176 cur.execute(query,input)
177
178 ans=cur.fetchall()
179
180 return [("count",),ans[0]]
181
182
183def findSimilarArticles(articleId,n):
184 # Create a new connection
185 con=connection()
186 # Create a cursor on the connection
187 cur=con.cursor()
188 que="SELECT articles.summary FROM articles "
189 cur.execute(que)
190 data=cur.fetchall()
191 for art_row in data :
192 sum=art_row[0]
193
194
195
196
197 return [("articleid",),]
198
199def jaccard_similarity(query, document):
200 intersection = set(query).intersection(set(document))
201 union = set(query).union(set(document))
202 return len(intersection)/len(union)