# Pasted snippet metadata: captured Mar 18, 2020, 12:44 PM (approx. 5 years before review)
3import bs4
4from requests.exceptions import ConnectionError, ConnectTimeout, ReadTimeout
5from time import time
6import requests
7import os
8import sys
9from threading import Thread
10import re
11import random
12# from core.models import HighCourt
13# from django.core.exceptions import ObjectDoesNotExist
14
# Default outbound HTTP proxy (local Privoxy/Tor front-end on port 8118).
# Individual requests may still override this via a per-call `proxies` argument.
proxies = { 'http': "127.0.0.1:8118" }
16
17
def pdf_downloader(file_directory, session, file_name, file_link):
    """Download `file_link` through `session` into `file_directory/file_name`.

    Runs as the target of fire-and-forget threads (see scrap_details), so it
    is deliberately best-effort: any failure is printed, never raised.

    Returns the absolute path of the saved file, or None on any error.
    """
    try:
        # Several download threads may race to create the same case folder;
        # tolerate "directory already exists" instead of crashing one thread.
        if not os.path.exists(file_directory):
            try:
                os.makedirs(file_directory)
            except OSError:
                if not os.path.isdir(file_directory):
                    raise

        # os.path.join instead of raw string concatenation; callers pass a
        # trailing-slash directory, which join handles identically.
        file_location = os.path.realpath(os.path.join(str(file_directory), str(file_name)))
        print(file_link, "FILE LINK")

        response = session.get(file_link)
        with open(file_location, "wb") as wf:
            wf.write(response.content)

        return str(file_location)

    except Exception as e:
        print('Error on line {} '.format(sys.exc_info()[-1].tb_lineno) + str(e))
        return None
37
38
def snake_case(name):
    """Normalize a scraped label to snake_case.

    Strips surrounding whitespace, drops every character that is not an
    ASCII letter, slash or space, then collapses runs of slashes/spaces
    into single underscores and lowercases the result.
    """
    stripped = name.strip()
    letters_only = re.sub(r'[^a-zA-Z/ ]', '', stripped)
    return re.sub(r'[/ ]+', '_', letters_only.lower())
42
43
def scrap_table_with_th_labels(table_soup):
    """Scrape a table whose first row uses <th> header cells.

    Header text (snake_cased) becomes the keys; every later row appends
    its <td> values to the matching key, each terminated with ';'.
    Rows from an embedded 'Orders' section onwards are skipped.
    Returns an empty dict when table_soup is None.
    """
    result = {}
    headers = []
    if table_soup is None:
        return result

    # Column labels come from the <th> cells.
    for th in table_soup.find_all('th'):
        header = snake_case(th.text.encode('utf-8').strip(''))
        headers.append(header)
        result[header] = ""

    # Data rows follow the header row; stop at the 'Orders' sub-table.
    for row in table_soup.find_all('tr')[1:]:
        if 'Orders' in row.text:
            break
        for idx, cell in enumerate(row.find_all('td')):
            result[headers[idx]] += cell.text.encode('utf-8').replace('\xc2\xa0', '') + ";"

    return result
68
69
def scrap_table(table_soup):
    """Scrape a generic row-oriented table (orders/objections/documents).

    The first row supplies snake_case column keys (from <td> or <th>
    cells); every subsequent row's cell values are accumulated under the
    matching key, ';'-separated. Anchor cells are resolved to absolute
    eCourts URLs instead of their text.
    """
    table_row = table_soup.tr
    # Header row: labels may be in either td or th cells.
    table_headers_dict = [snake_case(order.text) for order in table_row.select('td, th')]

    orders = []
    data_detail = {header: '' for header in table_headers_dict}

    try:
        # Walk sibling rows; next_sibling may yield whitespace text nodes,
        # which simply contribute no bs4.Tag children below.
        while table_row.next_sibling:
            table_row = table_row.next_sibling
            this_data = []
            for data in table_row:
                if type(data) == bs4.Tag:
                    if data.a:
                        # Link cell: make the relative order-PDF href absolute.
                        this_data.append('https://services.ecourts.gov.in/ecourtindiaHC/cases/' + data.a['href'])
                    elif data.name == 'td':
                        try:
                            this_data.append(data.string.strip().upper())
                        except AttributeError:
                            # Cell has no .string (empty or nested markup).
                            this_data.append('')

            if this_data:
                orders.append(this_data)
    except TypeError:
        # NOTE(review): presumably guards against a sibling that cannot be
        # iterated/tested; treated as "end of rows" -- confirm against bs4.
        pass

    # Flatten column-wise: one ';'-terminated value per data row per key.
    # NOTE(review): assumes every collected row has at least as many cells
    # as there are headers; a shorter row would raise IndexError here.
    for i in range(len(orders)):
        for ii in range(len(table_headers_dict)):
            data_detail[table_headers_dict[ii]] += str(orders[i][ii]) + ';'

    return data_detail
101
102
def scrap_details(case_id, session, soup, user_id, case_det):
    """Extract every section of a parsed eCourts HC case-history page.

    Parameters:
        case_id  -- synthetic eCourts case number; used as the folder name
                    for downloaded order PDFs ('orders/<case_id>/').
        session  -- the requests session used for the original fetch; reused
                    by the PDF download threads so cookies/proxy carry over.
        soup     -- BeautifulSoup of the case-history HTML.
        user_id  -- optional user to associate with the case (only referenced
                    by the commented-out DB persistence code below).
        case_det -- dict of state/dist/court codes and case identifiers;
                    mutated here to add 'court_name'.

    Returns (status, details, message): (1, details, message) on success,
    (0, details, error) if the (currently disabled) persistence step raises.
    """
    info_dict = {}
    document_details_dict = {}
    category_details_dict = {}
    objections_dict = {}
    counter = 1
    last_counter = ''

    scraping_start_time = time()
    # "Case Details" block: children alternate label / value, so odd
    # positions become keys and the following even position the value.
    for span in soup.select('span.case_details_table'):
        for child in span.children:

            if child.string:
                if counter % 2 == 1:
                    label = snake_case(child.string)
                    info_dict[label] = ''
                    last_counter = label
                else:
                    # [2:] skips a 2-char leading separator (presumably
                    # ': ') -- TODO confirm against the live markup.
                    info_dict[last_counter] = child.string[2:].upper().strip()

                counter += 1
            elif type(child) == bs4.Tag:

                # A tag child carries 'Label : Value' in a single node.
                key, value = tuple(child.text.split(':'))
                info_dict[snake_case(key)] = value.strip().upper()

    # "Case Status" block: <strong> elements alternate label / value.
    for span in soup.find('h2', text='Case Status').next_sibling.select('span'):
        counter = 1
        last_counter = ''

        for strong in span.select('strong'):
            if counter % 2 == 1:
                label = snake_case(strong.string)
                info_dict[label] = ''
                last_counter = label
            else:
                info_dict[last_counter] = strong.string[2:].upper().strip()

            counter += 1
    # print("info_dict is below:")
    # print(info_dict)

    # Parties and their advocates: raw block text, non-breaking spaces removed.
    petitioner_block = soup.find('span', {'class': 'Petitioner_Advocate_table'})
    respondent_block = soup.find('span', {'class': 'Respondent_Advocate_table'})
    party_dict = dict()
    party_dict['petitioner_and_advocate'] = petitioner_block.text.encode('utf-8').replace('\xc2\xa0', '')
    party_dict['respondent_and_advocate'] = respondent_block.text.encode('utf-8').replace('\xc2\xa0', '')
    # print('party_dict: ')
    # print(party_dict)

    # Acts/sections table (optional).
    acts_table_soup = soup.find('table', {"id": "act_table"})
    # print(acts_table_soup)
    acts_dict = {}
    if acts_table_soup is not None:
        acts_dict = scrap_table_with_th_labels(acts_table_soup)
        # print ("acts_dict ")
        # print(acts_dict)
    else:
        print('No acts table')

    # Linked cases table (optional).
    linked_cases_table_soup = soup.find('table', {"class": "linkedCase"})
    # print(acts_table_soup)
    linked_cases_dict = {}
    if linked_cases_table_soup is not None:
        linked_cases_dict = scrap_table_with_th_labels(linked_cases_table_soup)
        #print ("linked_cases_dict " + str(linked_cases_dict))
    else:
        print('No linked cases table')

    # Hearing history table (optional).
    case_history_table_soup = soup.find('table', {"class": "history_table"})
    # print(case_history_table_soup)
    case_history_dict = {}
    if case_history_table_soup is not None:
        case_history_dict = scrap_table_with_th_labels(case_history_table_soup)
        #print ("case_history_dict ")
        #print(case_history_dict)
    else:
        print("No case history table")

    # Sub-matters: only the second <td> (the case number) is captured.
    sub_matters_block = soup.find("table", {"class": "MainCase"})
    submatters_dict = {}
    if sub_matters_block is not None:
        submatters_dict['case_number'] = sub_matters_block.find_all('td')[1].text\
            .encode('utf-8').replace('\xc2\xa0', '')
        # print('submatters_dict ')
        # print(submatters_dict)

    # Subordinate (lower) court info: <span> labels zipped with <label> values.
    subordinate_court_info_dict = {}
    keys = []
    values = []
    subordinate_court_block = soup.find('span', {'class': 'Lower_court_table'})
    if subordinate_court_block is not None:
        for key in subordinate_court_block.find_all('span'):
            keys.append(snake_case(key.text.encode('utf-8')))
        for value in subordinate_court_block.find_all('label'):
            values.append(value.text.encode('utf-8').replace('\xc2\xa0', ''))
        # print (keys, values)
        subordinate_court_info_dict = dict(zip(keys, values))
        keys = []
        values = []
    # print("subordinate_court_info_dict is below:")
    # print (subordinate_court_info_dict)

    orders_dict = {}

    # Each '#orderheading' element introduces one of the optional trailing
    # tables: orders / objection / document details / category details.
    for orderheading in soup.select("#orderheading"):

        table_name = orderheading.text.lower().strip()
        sibling = orderheading.next_sibling

        if table_name == 'orders':
            order_detail = scrap_table(sibling)
            # print(order_detail)
            download_threads = []
            links = order_detail['order_details'].split(';')[:-1]
            order_detail['order_links'] = []
            _dir = 'orders/' + case_id + '/'
            print "total orders: " + str(len(links))
            # One fire-and-forget thread per order PDF. Threads are started
            # below but never joined, so downloads may still be in flight
            # when this function returns; order_links records the *expected*
            # file paths, not confirmed downloads.
            for i in range(len(links)):
                download_threads.append(
                    Thread(target=pdf_downloader, args=(_dir, session, str(i + 1)
                                                        + ".pdf", links[i])))
                file_path = _dir + str(i + 1) + ".pdf"
                order_detail['order_links'].append(file_path)

            for thread in download_threads:
                thread.start()
            order_detail['order_links'] = '~'.join(order_detail['order_links'])
            #print download_threads, "DOWNLOAD THREADSSS"
            #print links, "LINKSSSSSSSSSSSSSSS"
            orders_dict = order_detail

        if table_name == 'objection':
            # print("objection_details"+str(scrap_details(sibling)))
            objections_dict = scrap_table(sibling)
            # print (objections_dict)
            # print ("")

        if table_name == 'document details':
            document_details_dict = scrap_table(sibling)
        if table_name == 'category details':
            category_details_dict['Category'] = sibling.find_all('td')[1].text
    # print('orders_dict: ')
    # print(orders_dict)

    # print('document_details_dict: ')
    # print (document_details_dict)
    # print('category_details_dict:')
    # print(category_details_dict)
    # print('objections_dict:')
    # print(objections_dict)

    # Code-triple -> High Court name lookup.
    # NOTE(review): the key below is built as state-court-dist; confirm the
    # table's keys use the same ordering. An unlisted court raises KeyError.
    courts__ = {'24-1-1': 'Sikkim', '18-1-1': 'Chhattisgarh', '12-1-1': 'Jammu & Kashmir, Jammu', '2-1-1': 'Hyderabad',
                '21-1-1': 'Meghalaya', '17-1-1': 'Gujarat', '15-1-1': 'Uttarakhand', '20-1-1': 'Tripura',
                '9-1-1': 'Rajasthan, Jaipur', '11-1-1': 'Orissa', '9-2-1': 'Rajasthan, Jodhpur', '4-1-1': 'Kerala',
                '7-1-1': 'Jharkhand', '10-1-1': 'Madras', '12-2-1': 'Jammu & Kashmir, Srinagar', '25-1-1': 'Manipur',
                '13-1-1': 'Allahabad', '5-1-1': 'Himachal Pradesh', '6-1-1': 'Gauhati'}

    case_det['court_name'] = courts__[case_det['state_code'] + '-' + case_det['court_code'] + '-' + case_det['dist_code']] + " High Court"

    # Everything scraped so far, grouped by section.
    details = {
        'case': case_det,
        'info': info_dict,
        'party': party_dict,
        'acts_dict': acts_dict,
        'linked_cases': linked_cases_dict,
        'case_history': case_history_dict,
        'submatters': submatters_dict,
        'subordinate_court_info': subordinate_court_info_dict,
        'orders': orders_dict,
        'document_details': document_details_dict,
        'category_details': category_details_dict,
        'objections': objections_dict,
    }

    print(details)

    # Maps details[section][scraped_key] -> flat model-field name used by
    # the (commented-out) HighCourt ORM persistence below.
    mapping = {
        'case': {
            'case_year': 'case_year',
            'case_number': 'case_number',
            'case_type_code': 'case_type_code',
            'state_code': 'state_code',
            'court_code': 'court_code',
            'dist_code': 'dist_code',
            'court_name': 'court_name'
        },
        'info': {
            'filing_number': 'filing_number',
            'first_hearing_date': 'first_hearing_date',
            'next_hearing_date': 'next_hearing_date',
            'cnr_number': 'cnr_number',
            'nature_of_disposal': 'nature_of_disposal',
            'causelist_name': 'cause_list_name',
            'coram': 'coram',
            'judicial': 'judicial',
            'registration_date': 'registration_date',
            'case_status': 'case_status',
            'bench': 'bench',
            'filing_date': 'filing_date',
            'case_type': 'case_type',
            'decision_date': 'decision_date',
            'registration_number': 'registration_number',
        },
        'party': {
            'respondent_and_advocate': 'respondent',
            'petitioner_and_advocate': 'petitioner'
        },
        'acts_dict': {
            'under_acts': 'under_act',
            'under_sections': 'under_section',
        },
        'linked_cases': {
            'filing_number': 'linked_case_filing_numbers',
            'case_number': 'linked_case_numbers'
        },
        'case_history': {
            'judge': 'case_history_judges',
            'business_on_date': 'case_history_business_on_date',
            'purpose_of_hearing': 'case_history_purpose_of_hearing',
            'hearing_date': 'case_history_hearing_date'
        },
        'submatters': {
            'case_number': 'submatters_case'
        },
        'subordinate_court_info': {
            'district': 'subordinate_court_district',
            'state': 'subordinate_court_state',
            'court_number_and_name': 'subordinate_court_name_number',
            'case_number_and_year': 'subordinate_court_case_number_year',
            'case_decision_date_': 'subordinate_court_decision_date'
        },
        'orders': {
            'judge': 'order_judges',
            'order_date': 'order_dates',
            'order_details': 'order_links'
        },
        'document_details': {
            'date_of_receiving': 'document_date_recieving',
            'document_no': 'document_number',
            'name_of_advocate': 'document_advocate_name',
            'filed_by': 'document_filed_by',

        },
        'category_details': {
            'category': 'case_category'
        },
        'objections': {
            'objection': 'objection',
            'scrutiny_date': 'scrutiny_date',
            'receipt_date': 'objection_reciept_date',
            'compliance_date': 'objection_compliance_date'
        }
    }

    # Flatten details per the mapping; keys a section didn't yield are skipped.
    _main_dict = dict()
    message = ''

    for _dict_name in mapping:
        for _dict_key, _model_field in mapping[_dict_name].items():
            try:
                _main_dict[_model_field] = details[_dict_name][_dict_key]
            except KeyError:
                pass

    # DB persistence (Django HighCourt model) is currently disabled; the
    # try/except is kept so re-enabling the block restores error handling.
    try:
        pass
        # try:
        #     case_obj = HighCourt.objects.get(**case_det)
        #     message += 'Case found in db, '

        #     next_hearing_date = info_dict.get("next_hearing_date", None)
        #     print("next_hearing_date")
        #     print(next_hearing_date, type(next_hearing_date))
        #     print("old wala")
        #     print(case_obj.next_hearing_date)

        #     if case_obj.next_hearing_date != next_hearing_date:

        #         success_flag = 2

        #         print("next hearing date updated")
        #         message += 'Next hearing date updated, '

        #         # case updation sms and email to user
        #         if user_id is not None:
        #             case_updation_text = 'Case Update Alert!' + '\n' + 'Case No.: ' + str(
        #                 case_det['case_number']) + ' Case Type: ' + info_dict['case_type'] + ' Year: ' + str(
        #                 case_det['case_year']) \
        #                 + ' has been reupdated'
        #             case_updation_text = '"' + case_updation_text + '"'
        #             # send_sms(case_updation_text, mobile_num)
        #             message += 'Message sent to user'

        #     for key in _main_dict:
        #         setattr(case_obj, key, _main_dict[key])

        #     print('this HighCourt case exists in db..So updating the case')
        #     case_obj.save()
        #     print('case updated successfully')
        #     message += 'Case updated'

        #     success_flag = 1

        # except ObjectDoesNotExist:
        #     case_obj = HighCourt(**_main_dict)
        #     case_obj.save()

        #     success_flag = 1

        #     print('case added successfully')
        #     message += 'Case added successfully, '
        #     success_flag = 1

        # if user_id is not None:
        #     case_obj.new_user.add(user_id)
        #     case_obj.save()
        #     message += 'Added user to case'

        #     success_flag = 1

    except Exception as e:
        print(str(e))
        print("failed to create HighCourt object in db")
        return 0, details, str(e)

    # category_block = soup.find('table', {"id":"orderheading"})
    # print(category_block.next_sibling)

    print('Scraping Time:', time() - scraping_start_time)

    return 1, details, message
437
438
def main(state_code, dist_code, court_code, case_type, case_no, case_year, user_id=None, *args):
    """Fetch one HC case-history page from eCourts and scrape it.

    state_code/dist_code/court_code identify the High Court bench;
    case_type, case_no and case_year identify the case; user_id is
    passed through to scrap_details for (disabled) DB association.

    Returns a (status, details, message) tuple:
         1 -> scraped successfully (details is the scraped dict)
         0 -> non-200 status or application-level 'ERROR' page
        -1 -> connection problem, read timeout, or case not found
    """
    state_code = str(state_code)
    dist_code = str(dist_code)
    court_code = str(court_code)
    # eCourts case-number layout: '2' + case type zero-padded to 3 digits
    # + case number zero-padded to 7 digits + 4-digit year.
    case_id = '2' + str(case_type).zfill(3) + str(case_no).zfill(7) + str(case_year)

    case_id = str(case_id)

    print('\n')
    print("State Code: " + state_code)
    print("Court Code: " + court_code)
    print("Dist Code: " + dist_code)
    print("Case ID: " + case_id)
    print('\n')

    url = 'https://services.ecourts.gov.in/ecourtindiaHC/cases/o_civil_case_history.php'
    case_data = {
        'state_code': str(state_code),
        'dist_code': str(dist_code),
        'court_code': str(court_code),
        'case_no': str(case_id)
    }

    case = {
        'state_code': state_code,
        'dist_code': dist_code,
        'court_code': court_code,
        'case_type_code': case_type,
        'case_number': case_no,
        'case_year': case_year
    }

    # Route both HTTP and HTTPS through the local proxy (Privoxy/Tor front).
    proxies = {"http": "127.0.0.1:8118", "https": "127.0.0.1:8118"}

    # Rotate user agents to look less like a bot.
    user_agent_list = [
        # Chrome
        'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        # Firefox
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
    ]

    header = random.choice(user_agent_list)
    header = {'user-agent': header}

    try:
        api_start_time = time()
        session = requests.session()
        # BUG FIX: this was session.proxies.update(proxies=proxies), which
        # stored the whole dict under the literal key 'proxies' instead of
        # merging the 'http'/'https' entries -- so requests made through
        # this session later (the PDF download threads in scrap_details)
        # were never proxied. Pass the mapping positionally.
        session.proxies.update(proxies)

        print(url, case_data)
        doc = session.post(url, data=case_data, timeout=50, proxies=proxies, headers=header)
        print(doc.status_code)

        print("API Time:", time() - api_start_time)

        if doc.status_code != 200 or doc.text.lower() == 'error':
            # print (doc.content)
            return 0, {}, 'Status code not 200 or ERROR'

    except (ConnectionError, ConnectTimeout):
        error = 'Error Connecting Server'
        print(error)
        return -1, {}, error
    except ReadTimeout:
        error = 'No Data Received from Server. Probably CASE DO NOT EXIST'
        print(error)
        return -1, {}, error

    soup = bs4.BeautifulSoup(doc.text, 'lxml')
    # Python-2 bytes handling: the server prefixes a UTF-8 BOM before
    # 'ERROR' when the case is not found.
    text = doc.text.encode('utf-8').strip(' ')
    # print(doc.text, '', text)

    if '\xef\xbb\xbfERROR' == text:
        return -1, {}, 'Case not found'

    return scrap_details(case_id, session, soup, user_id, case)
521
# Guarded entry point so importing this module does not immediately fire a
# network scrape. Sample invocation: Rajasthan-Jaipur bench (9-1-1),
# case type 30, case no. 400, year 2018.
if __name__ == '__main__':
    main(9, 1, 1, 30, 400, 2018)