rLP8ehmh

· 6 years ago · May 27, 2019, 12:26 PM
1# ----- CONFIGURE YOUR EDITOR TO USE 4 SPACES PER TAB ----- #
2import sys, os
3sys.path.append(os.path.join(os.path.split(os.path.abspath(__file__))[0], 'lib'))
4import pymysql as db
5import settings
6
7
8def connection():
9    ''' User this function to create your connections '''
10    con = db.connect(
11        settings.mysql_host,
12        settings.mysql_user,
13        settings.mysql_passwd,
14        settings.mysql_schema)
15
16    return con
17
18def sortFourth(val):
19    return val[3]
20
21def Classify_Create_Final(topn,category_list):
22    final=[]
23    # Add table header
24    tmpt=("title","class","subclass", "weightsum")
25    final.append(tmpt)
26
27    # For each article list containing the first-topn categories of the article
28    for article_block in category_list:
29        # Add all the category data at first and then if there are blanks fill spaces so that for every article 7 rows are displayed (optional)
30        for i in range(0,len(article_block)):
31            tmpt=(article_block[i][0],article_block[i][1],article_block[i][2],article_block[i][3])
32            final.append(tmpt)
33        # Add (topn-category amount) blank values
34        for i in range(len(article_block),topn):
35            tmpt=("----","----","----","----")
36            final.append(tmpt)
37
38
39
40
41    return final
42
43
44def classify(topn):
45
46    # Create a new connection
47    con=connection()
48
49    # Create a cursor on the connection
50    cur=con.cursor()
51
52    # Query that returns the title and the summary for every article without a category (the first is with join , the second without join)
53    # que="SELECT art.title,art.summary FROM articles art LEFT JOIN article_has_class ahc ON (ahc.articles_id=art.id) WHERE (ahc.articles_id IS NULL)"
54    que="SELECT title , summary FROM articles WHERE articles.id NOT IN (SELECT articles.id FROM articles , article_has_class WHERE (article_has_class.articles_id=articles.id))"
55    cur.execute(que)
56
57    data=cur.fetchall()
58
59    art_category=[]
60
61    # For each article that is returned by the query , take the summary(sum) and split it into single words
62    for article_row in data:
63        category_list=[]
64        title=article_row[0]
65        sum=article_row[1]
66        words=sum.split()
67        # For each word run the following query which returns all categories that are linked with it (and their weights)
68        for w in words:
69            que="SELECT * FROM classes WHERE (term=%s)"
70            input=(w)
71            row_c=cur.execute(que,input)
72
73            if row_c > 0:
74                category_res=cur.fetchall()
75                # For each category returned by the query prepare the data and create a temporary list
76                for category_row in category_res:
77                    class1=category_row[0]
78                    subclass=category_row[1]
79                    term=category_row[2]
80                    weight=float(category_row[3])
81                    tmpt=[title,class1,subclass,weight]
82
83                    # Search if the temporary list's category already exists in the general array
84                    found=False
85                    for tmpcat in category_list:
86                        cl=tmpcat[1]
87                        subcl=tmpcat[2]
88                        # If it exists , simply add the weight of the current temporary list to the one that is inside the general array
89                        if cl==class1 and subcl==subclass:
90                            tmpcat[3]=weight+tmpcat[3]
91                            found=True
92                            break
93
94                    # If it doesn't exist , insert a new category
95                    if found==False:
96                        category_list.append(tmpt)
97
98        # After all the insertions , sort the general array by the 'weightsum' in descending order
99        category_list.sort(key = sortFourth , reverse=True)
100        # Slice the general array into its first 'topn' elements and insert it into the (almost) final 3D matrix
101        art_category.append(category_list[:int(topn)])
102
103
104
105    final_tuple=Classify_Create_Final(int(topn),art_category)
106    return final_tuple
107
108
109def updateweight(class1,subclass,weight):
110
111   # Create a new connection
112
113    con=connection()
114    w1=int(weight)
115    # Create a cursor on the connection
116    cur=con.cursor()
117
118    que= "SELECT * FROM classes  WHERE class = %s AND subclass=%s and weight > %d " % (class1,subclass,weight)
119    cur.execute(que,input)
120    data=cur.fetchall()
121    #
122    if data > 0 :
123        tmpw=(class_row[3]-weight)/2
124        class_row[3]=class_row[3]-tmpw
125        return [("result",), "ok"]
126    else:
127        return [("result",), "error"]
128
129
130
131def selectTopNClasses(fromdate, todate, n):
132
133    # Create a new connection
134
135    con=connection()
136
137    # Create a cursor on the connection
138    cur=con.cursor()
139    test= int(n)
140
141    que=""" SELECT article_has_class.class , article_has_class.subclass, count(article_has_class.class)
142            FROM article_has_class , articles
143            WHERE articles.id=article_has_class.articles_id IN (
144                SELECT articles.id
145                FROM articles , article_has_class , classes
146                WHERE  ( classes.class = article_has_class.class AND
147	                     classes.subclass = article_has_class.subclass AND
148                         article_has_class.articles_id = articles.id AND
149                         articles.date > %s AND articles.date < %s)
150                                                                )
151            GROUP BY article_has_class.class
152            ORDER BY count(class) DESC
153            LIMIT %d"""
154
155    input=(fromdate , todate , n )
156    cur.execute(que,input)
157    answer=cur.fetchall()
158    return [("class","subclass", "count"),answer]
159
160def countArticles(class1,subclass):
161    # Create a new connection
162    con=connection()
163    # Create a cursor on the connection
164    cur=con.cursor()
165
166    query="""SELECT COUNT(*)
167            FROM (SELECT art.title
168	        from classes c
169	        inner join article_has_class ahc on (ahc.class=c.class AND ahc.subclass=c.subclass)
170	        inner join articles art on (art.id=ahc.articles_id)
171	        where (c.class=%s AND c.subclass=%s)
172	        group by art.id) as t1"""
173
174    input=(class1,subclass)
175
176    cur.execute(query,input)
177
178    ans=cur.fetchall()
179
180    return [("count",),ans[0]]
181
182
183def findSimilarArticles(articleId,n):
184    # Create a new connection
185    con=connection()
186    # Create a cursor on the connection
187    cur=con.cursor()
188    que="SELECT articles.summary FROM articles "
189    cur.execute(que)
190    data=cur.fetchall()
191    for art_row in data :
192        sum=art_row[0]
193        
194        
195
196
197    return [("articleid",),]
198
199def jaccard_similarity(query, document):
200    intersection = set(query).intersection(set(document))
201    union = set(query).union(set(document))
202    return len(intersection)/len(union)