· 7 years ago · Jan 08, 2019, 07:20 PM
1# -*- coding: utf-8 -*-
#AUTHOR: Łukasz Szadowski
3#
4
5import re
6import urllib.request
7import time
8import os
9import platform
# Shell command that wipes the terminal, chosen once from the host OS name.
_CLEAR_COMMAND = 'cls' if platform.system() == 'Windows' else 'clear'


def clear():
    """Clear the terminal by running the platform's clear-screen command.

    Returns:
        The exit status reported by ``os.system``.
    """
    return os.system(_CLEAR_COMMAND)
14
15
16# --- sql ---
17import sqlite3
print("Doing sql stuf...")

# Schema script: drops any existing ``cards`` table and recreates it, so every
# run of this module starts from an empty table.
_CARDS_SCHEMA = '''
DROP TABLE IF EXISTS cards;
CREATE TABLE IF NOT EXISTS cards(
 mID INTEGER PRIMARY KEY ,
 nr TEXT ,
 name TEXT ,
 MC TEXT,
 CMC INTEGER,
 type TEXT ,
 text TEXT,
 flavor TEXT,
 expantion TEXT ,
 rarity TEXT ,
 artist TEXT ,
 power TEXT,
 toughness TEXT
)
'''

# Open the database file, run the schema script, and close the connection.
con = sqlite3.connect('mtg.db')
con.row_factory = sqlite3.Row
cur = con.cursor()
cur.executescript(_CARDS_SCHEMA)
con.close()

print("Sql stuf DONE")
44# --- url variables ---
page_nr = 0  # index of the search-result page currently being requested

# Search URL pieces: prefix ending in "page=", then a format filter suffix.
url_p1 = "http://gatherer.wizards.com/Pages/Search/Default.aspx?output=standard&page="
url_p2 = "&format=%5b%22Standard%22%5d"
url_p3 = '&format=["Modern"]'  # alternative filter; not referenced in the visible code
url = "".join((url_p1, str(page_nr), url_p2))

# --- card variables ---

# mId collects multiverse ids found by get_cards(); content_id starts empty.
mId, content_id = [], []
57
58
# --- functions ---
def doRequest(furl):
    """Download ``furl`` and return the response body as text.

    Args:
        furl: URL to fetch ("function url").

    Returns:
        The response body decoded with the charset declared in the response
        headers, falling back to UTF-8 when none is declared.

    Raises:
        urllib.error.URLError: if the request fails.
    """
    # The context manager closes the response even when read/decode fails,
    # instead of leaving the socket open until garbage collection.
    with urllib.request.urlopen(furl) as response:
        charset = response.headers.get_content_charset() or "utf-8"
        return response.read().decode(charset)
def getCard(number):
    """Download the Gatherer details page for one card.

    Args:
        number: multiverse id of the card; converted with ``str`` so both
            strings and integers are accepted.

    Returns:
        The page text returned by ``doRequest``. (Previously the page was
        downloaded and then discarded.)
    """
    url_partial = "http://gatherer.wizards.com/Pages/Card/Details.aspx?multiverseid="
    return doRequest(url_partial + str(number))
def find_max_site_nr():
    """Return the highest page index linked from the current search page.

    Downloads the module-level ``url`` and scans it for ``page=<digits>``
    links.

    Returns:
        The page number taken from the selected link as an ``int``, or ``0``
        when the page contains no ``page=`` links at all.
    """
    rPageNr = re.compile(r'page=[\d]+')
    rNr = re.compile(r'[\d]+')

    site = doRequest(url)                 # pulling down site
    page_links = rPageNr.findall(site)    # every "page=N" occurrence, in order

    if not page_links:
        # No pagination links were found; report page 0 instead of failing
        # with an IndexError below.
        return 0

    # Original author's notes: the navigation buttons appear both at the top
    # and at the bottom of the page (hence the division by two), and in one of
    # the two layouts the last link points back to page 0, so the
    # second-to-last link is used there.
    # NOTE(review): confirm the two layouts against the live markup.
    if len(page_links) / 2 > 15:
        nr = page_links[-1]
    else:
        # Kept as len - 2 (not [-2]) so a single link still selects itself.
        nr = page_links[len(page_links) - 2]

    return int(rNr.findall(nr)[0])
def get_cards():
    """Append the multiverse ids linked from the current search page to ``mId``.

    Downloads the module-level ``url``, finds every card-details link, and
    extends the module-level ``mId`` list with the first digit run of each
    link (as strings). Returns nothing.
    """
    link_pattern = re.compile(
        r'<a href="\.\./Card/Details\.aspx\?multiverseid=[\d]+" id=')
    digits = re.compile(r'[\d]+')

    page = doRequest(url)
    # Extract every id first, then add them all to the shared list.
    found_ids = [digits.findall(link)[0] for link in link_pattern.findall(page)]
    mId.extend(found_ids)
97
98
99
def get_name_from_findall( list ):
    """Pick the trailing word run out of one or two regex match strings.

    For each considered entry, every run matching ``\\w[\\w',\\-\\s]+`` is
    collected and the last run is kept.

    Args:
        list: sequence of matched strings; only the first two are examined.

    Returns:
        ``[first, second]`` when more than one entry is given, otherwise
        ``[first, None]``.

    Raises:
        IndexError: if ``list`` is empty or an examined entry has no run.
    """
    word_run = re.compile(r"\w[\w',\-\s]+")
    if len(list) > 1:
        runs_a, runs_b = (word_run.findall(entry) for entry in list[:2])
        return [runs_a[-1], runs_b[-1]]
    return [word_run.findall(list[0])[-1], None]
# Generic mana amounts recognised in image alt texts, in application order.
_GENERIC_MANA = [str(n) for n in (1, 2, 3, 4, 5, 6, 7, 8, 9, 0,
                                  10, 11, 12, 13, 14, 15, 16, 17, 18)]

# Colour names used in image alt texts and the symbol written in their place.
_COLOR_SYMBOLS = (
    ("Colorless", "Colorless"),
    ("White", "W"),
    ("Blue", "U"),
    ("Black", "B"),
    ("Red", "R"),
    ("Green", "G"),
)

# Ordered (pattern, replacement) steps applied by remove_tag_sub.
# Symbol steps turn alt="X" into ">{X}<", which splits the surrounding tag in
# two so the tag-stripping step removes both halves and leaves "{X}" behind.
_REPLACEMENT_STEPS = (
    [(r'alt="Tap"', ">{TAP}<")]
    + [('alt="%s"' % amount, ">{%s}<" % amount) for amount in _GENERIC_MANA]
    + [('alt="%s"' % color, ">{%s}<" % symbol) for color, symbol in _COLOR_SYMBOLS]
    # remove tags
    + [(r'<[\[\]\s\w="-;:\.,/\'&\?]+>', "")]
    # drop field labels and line breaks followed by whitespace
    + [
        (r'Card Text:[\s]+', ""),
        (r'Flavor Text:', ""),
        (r'Expansion:', ""),
        (r'Rarity:', ""),
        (r'Card Number:', ""),
        (r'Artist:', ""),
        (r'All Sets:', ""),
        (r'\r\n[\s]+', ""),
    ]
)


def remove_tag_sub(stringq):
    """Strip HTML tags from ``stringq``, keeping mana symbols as ``{X}`` text.

    Applies every step of ``_REPLACEMENT_STEPS`` in order with ``re.sub``:
    symbol alt texts become ``{...}`` markers, tags are removed, then field
    labels and CRLF-plus-whitespace runs are deleted.

    Args:
        stringq: HTML fragment to clean.

    Returns:
        The cleaned text.
    """
    for pattern, replacement in _REPLACEMENT_STEPS:
        stringq = re.sub(pattern, replacement, stringq)
    return stringq
def remove_tag(list__):
    """Clean every HTML fragment in ``list__`` in place with ``remove_tag_sub``.

    Works for single-sided cards (one fragment) and two-sided cards (two
    fragments); longer lists are now cleaned completely, and an empty list is
    returned unchanged instead of raising ``IndexError``.

    Args:
        list__: mutable list of HTML fragments; it is modified in place.

    Returns:
        The same list object, with each element cleaned.
    """
    for index, fragment in enumerate(list__):
        list__[index] = remove_tag_sub(fragment)
    return list__
def get_card_text(pattern,repl,stringq):
    """Return ``stringq`` with every match of ``pattern`` replaced by ``repl``.

    Thin wrapper around regular-expression substitution.
    """
    compiled = re.compile(pattern)
    return compiled.sub(repl, stringq)
def _first_cleaned(pattern, site, default="NULL"):
    """Return the first match of ``pattern`` in ``site`` with markup stripped.

    Falls back to ``default`` (the file's "NULL" placeholder) when nothing
    matches, instead of raising ``IndexError``.
    """
    matches = re.findall(pattern, site)
    if not matches:
        return default
    return remove_tag_sub(matches[0])


def get_card_content(id):
    """Download one card's details page, parse it, and store it in ``mtg.db``.

    The page is fetched with ``doRequest``; name, mana cost, converted mana
    cost, types, card text, flavor text, expansion, rarity, card number,
    artist and power/toughness are extracted with regular expressions, printed
    and inserted as one row into the ``cards`` table. Afterwards every row of
    the table is printed.

    Fields that cannot be found are stored as placeholders: ``"Error"`` for
    the name, ``"ErrorType"`` for the types, ``0`` for both mana costs and
    ``"NULL"`` for everything else.

    Args:
        id: multiverse id of the card (string or int).

    Returns:
        None. Pages on which the name pattern matches exactly twice are
        treated as two-sided cards: a message is printed and nothing is
        stored.

    Raises:
        sqlite3.Error: if the insert fails. NOTE(review): the table declares
            ``mID`` as primary key, so storing the same id twice presumably
            fails.
    """
    url1 = 'http://gatherer.wizards.com/Pages/Card/Details.aspx?multiverseid='
    url = url1 + str(id)
    site = doRequest(url)
    print("\n" + url)

    # --- name ---
    names = re.findall(
        r'Card Name:</div>[\s]+<div class="value">' + r"\s+[\w',\-\s]+", site)
    if len(names) == 2:
        print("Nie obsługuje dwustronnych kart!\r\n")
        return
    try:
        name = get_name_from_findall(names)[0]
    except IndexError:
        name = "Error"

    # --- mana cost ---
    mc_matches = re.findall(
        r'Mana Cost:[\s\w<>="-;:\.,/\'&\?]+Converted Mana Cost:', site)
    if mc_matches:
        mc = remove_tag_sub(mc_matches[0])
        # The match runs up to the next label; drop both label remnants.
        mc = mc.replace('Mana Cost:', "").replace('Converted', "")
    else:
        mc = 0

    # --- converted mana cost (only on one side of a card) ---
    cmc_match = re.search(
        r'Converted Mana Cost:</div>[\s]+<div class="value">[\s]+([\d]+)', site)
    cmc = cmc_match.group(1) if cmc_match else 0

    # --- types ---
    # Both parts are raw strings so the regex escapes are not treated as
    # (invalid) string escapes.
    type_matches = re.findall(
        r'Types:</div>[\s]+<div class="value">' + r"[\s]+[\w',\-\s]+\w", site)
    try:
        types = get_name_from_findall(type_matches)[0]
    except IndexError:
        types = "ErrorType"

    # --- card text / flavor text ---
    # Each label counts as present only when it occurs exactly once.
    has_card_text = site.count('Card Text:') == 1
    has_flavor_text = site.count('Flavor Text:') == 1
    card_text_pattern = r'x;">(.+)</div><'

    if has_card_text and has_flavor_text:
        cardText = _first_cleaned(card_text_pattern, site)
        flavorText = _first_cleaned(
            r'Flavor Text:[\s\w—<>="-;:\.,/\'&\?]+<div class="label">', site)
    elif has_flavor_text:
        cardText = "NULL"
        flavorText = _first_cleaned(r'x">(.+)</div></', site)
    elif has_card_text:
        cardText = _first_cleaned(card_text_pattern, site)
        flavorText = "NULL"
    else:
        cardText = "NULL"
        flavorText = "NULL"

    # --- expansion, rarity, card number, artist ---
    expantion = _first_cleaned(
        r'Expansion:</div>[\s]+<div class="value"[\[\]\s\w—<>="-;:\.,/\'&\?]+Rarity:',
        site)
    rarity = _first_cleaned(
        r'Rarity:</div>[\s]+<div class="value">[\s]+<span class=[\s\w—<>="-;:\.,/\'&\?]+Card Number:',
        site)
    cardNumber = _first_cleaned(
        r'Card Number:[\s\w—<>="-;:\.,/\'&\?]+Artist:', site)
    artist = _first_cleaned(
        r'Artist:[\[\]\s\w—<>="-;:\.,/\'&\?]+</a>', site)

    # --- power / toughness ---
    P = "NULL"
    T = "NULL"
    # Substring test so "Artifact Creature", "Legendary Creature" and similar
    # type lines are handled too, not only the exact string "Creature".
    if "Creature" in types:
        # First "<power> / <toughness>" occurrence on the page, read straight
        # from the match groups instead of re-parsing a list repr.
        pt_match = re.search(r'([\w\*]{1,3})\s/\s([\w\*+]{1,3})', site)
        if pt_match:
            P, T = pt_match.groups()

    # str() keeps the prints working when the integer fallback 0 is in use.
    print("NAME:" + name)
    print("cmc:" + str(cmc))
    print("mc:" + str(mc))
    print("TYPE:" + types)
    print("CARD TEXT:" + cardText)
    print("FLAVOR TEXT:" + flavorText)
    print("EXPANTION:" + expantion)
    print("CARD NUMBER:" + cardNumber)
    print("RARITY:" + rarity)
    print("ARTIST:" + artist)
    print("POWER:" + P)
    print("TOUGHNESS:" + T)

    con = sqlite3.connect('mtg.db')
    try:
        cur = con.cursor()
        cur.execute(
            'INSERT INTO cards VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?);',
            (int(id), cardNumber, name, mc, int(cmc), types, cardText,
             flavorText, expantion, rarity, artist, P, T))
        # Without an explicit commit the inserted row is rolled back when the
        # connection is closed.
        con.commit()

        cur.execute("SELECT * FROM cards")
        for row in cur.fetchall():
            print(row)
    finally:
        # Release the database handle even when the insert fails.
        con.close()

    print("\nSql stuf DONE")
# NOTE: everything between the two ''' markers below is a bare string-literal
# statement, so the crawl loop it contains is never executed. As written, it
# would look up the number of result pages, call get_cards() once per page to
# fill mId, and print progress with a remaining-time figure measured against a
# fixed 215-second deadline.
'''
# --- MAIN APP ---
clear()
print("Downloading sites...")

time_max = time.time() + 215 #aproximation of downloading time

max = find_max_site_nr() + 1

print("PAGE: 0/" + str(max))

for i in range(0,max):
 #---show
 clear()
 timeC = time_max - time.time()
 print("Downloading sites...")
 print("PAGE: " + str(page_nr+1) + "/" + str(max))
 print("REMAINING TIME: " + str(timeC) + " sec")
 #---do stuff
 doRequest(url)
 get_cards()
 page_nr += 1
 url = url_p1 + str(page_nr) + url_p2
clear()
print("Downloading complete")
print("RECORDs COUNT: " + str(mId.__len__()) + "!!!!")

#end
'''
# The only statement that actually runs: fetch and store one hard-coded card.
get_card_content("442928")