# Pasted snippet metadata: captured Mar 18, 2020, 12:44 PM (approx. 5 years before review)
3import bs4
4from requests.exceptions import ConnectionError, ConnectTimeout, ReadTimeout
5from time import time
6import requests
7import os
8import sys
9from threading import Thread
10import re
11import random
12# from core.models import HighCourt
13# from django.core.exceptions import ObjectDoesNotExist
14
# Default outbound HTTP proxy (local Privoxy/Tor front-end on port 8118).
# Individual requests may still override this via a per-call `proxies` argument.
proxies = { 'http': "127.0.0.1:8118" }
16
17
def pdf_downloader(file_directory, session, file_name, file_link):
    """Download `file_link` through `session` into `file_directory/file_name`.

    Runs as the target of fire-and-forget threads (see scrap_details), so it
    is deliberately best-effort: any failure is printed, never raised.

    Returns the absolute path of the saved file, or None on any error.
    """
    try:
        # Several download threads may race to create the same case folder;
        # tolerate "directory already exists" instead of crashing one thread.
        if not os.path.exists(file_directory):
            try:
                os.makedirs(file_directory)
            except OSError:
                if not os.path.isdir(file_directory):
                    raise

        # os.path.join instead of raw string concatenation; callers pass a
        # trailing-slash directory, which join handles identically.
        file_location = os.path.realpath(os.path.join(str(file_directory), str(file_name)))
        print(file_link, "FILE LINK")

        response = session.get(file_link)
        with open(file_location, "wb") as wf:
            wf.write(response.content)

        return str(file_location)

    except Exception as e:
        print('Error on line {} '.format(sys.exc_info()[-1].tb_lineno) + str(e))
        return None
37
38
def snake_case(name):
    """Normalize a scraped label to snake_case.

    Strips surrounding whitespace, drops every character that is not an
    ASCII letter, slash or space, then collapses runs of slashes/spaces
    into single underscores and lowercases the result.
    """
    stripped = name.strip()
    letters_only = re.sub(r'[^a-zA-Z/ ]', '', stripped)
    return re.sub(r'[/ ]+', '_', letters_only.lower())
42
43
def scrap_table_with_th_labels(table_soup):
    """Scrape a table whose first row uses <th> header cells.

    Header text (snake_cased) becomes the keys; every later row appends
    its <td> values to the matching key, each terminated with ';'.
    Rows from an embedded 'Orders' section onwards are skipped.
    Returns an empty dict when table_soup is None.
    """
    result = {}
    headers = []
    if table_soup is None:
        return result

    # Column labels come from the <th> cells.
    for th in table_soup.find_all('th'):
        header = snake_case(th.text.encode('utf-8').strip(''))
        headers.append(header)
        result[header] = ""

    # Data rows follow the header row; stop at the 'Orders' sub-table.
    for row in table_soup.find_all('tr')[1:]:
        if 'Orders' in row.text:
            break
        for idx, cell in enumerate(row.find_all('td')):
            result[headers[idx]] += cell.text.encode('utf-8').replace('\xc2\xa0', '') + ";"

    return result
68
69
def scrap_table(table_soup):
    """Scrape a generic row-oriented table (orders/objections/documents).

    The first row supplies snake_case column keys (from <td> or <th>
    cells); every subsequent row's cell values are accumulated under the
    matching key, ';'-separated. Anchor cells are resolved to absolute
    eCourts URLs instead of their text.
    """
    table_row = table_soup.tr
    # Header row: labels may be in either td or th cells.
    table_headers_dict = [snake_case(order.text) for order in table_row.select('td, th')]

    orders = []
    data_detail = {header: '' for header in table_headers_dict}

    try:
        # Walk sibling rows; next_sibling may yield whitespace text nodes,
        # which simply contribute no bs4.Tag children below.
        while table_row.next_sibling:
            table_row = table_row.next_sibling
            this_data = []
            for data in table_row:
                if type(data) == bs4.Tag:
                    if data.a:
                        # Link cell: make the relative order-PDF href absolute.
                        this_data.append('https://services.ecourts.gov.in/ecourtindiaHC/cases/' + data.a['href'])
                    elif data.name == 'td':
                        try:
                            this_data.append(data.string.strip().upper())
                        except AttributeError:
                            # Cell has no .string (empty or nested markup).
                            this_data.append('')

            if this_data:
                orders.append(this_data)
    except TypeError:
        # NOTE(review): presumably guards against a sibling that cannot be
        # iterated/tested; treated as "end of rows" -- confirm against bs4.
        pass

    # Flatten column-wise: one ';'-terminated value per data row per key.
    # NOTE(review): assumes every collected row has at least as many cells
    # as there are headers; a shorter row would raise IndexError here.
    for i in range(len(orders)):
        for ii in range(len(table_headers_dict)):
            data_detail[table_headers_dict[ii]] += str(orders[i][ii]) + ';'

    return data_detail
101
102
def scrap_details(case_id, session, soup, user_id, case_det):
    """Extract every section of a parsed eCourts HC case-history page.

    Parameters:
        case_id  -- synthetic eCourts case number; used as the folder name
                    for downloaded order PDFs ('orders/<case_id>/').
        session  -- the requests session used for the original fetch; reused
                    by the PDF download threads so cookies/proxy carry over.
        soup     -- BeautifulSoup of the case-history HTML.
        user_id  -- optional user to associate with the case (only referenced
                    by the commented-out DB persistence code below).
        case_det -- dict of state/dist/court codes and case identifiers;
                    mutated here to add 'court_name'.

    Returns (status, details, message): (1, details, message) on success,
    (0, details, error) if the (currently disabled) persistence step raises.
    """
    info_dict = {}
    document_details_dict = {}
    category_details_dict = {}
    objections_dict = {}
    counter = 1
    last_counter = ''

    scraping_start_time = time()
    # "Case Details" block: children alternate label / value, so odd
    # positions become keys and the following even position the value.
    for span in soup.select('span.case_details_table'):
        for child in span.children:

            if child.string:
                if counter % 2 == 1:
                    label = snake_case(child.string)
                    info_dict[label] = ''
                    last_counter = label
                else:
                    # [2:] skips a 2-char leading separator (presumably
                    # ': ') -- TODO confirm against the live markup.
                    info_dict[last_counter] = child.string[2:].upper().strip()

                counter += 1
            elif type(child) == bs4.Tag:

                # A tag child carries 'Label : Value' in a single node.
                key, value = tuple(child.text.split(':'))
                info_dict[snake_case(key)] = value.strip().upper()

    # "Case Status" block: <strong> elements alternate label / value.
    for span in soup.find('h2', text='Case Status').next_sibling.select('span'):
        counter = 1
        last_counter = ''

        for strong in span.select('strong'):
            if counter % 2 == 1:
                label = snake_case(strong.string)
                info_dict[label] = ''
                last_counter = label
            else:
                info_dict[last_counter] = strong.string[2:].upper().strip()

            counter += 1
    # print("info_dict is below:")
    # print(info_dict)

    # Parties and their advocates: raw block text, non-breaking spaces removed.
    petitioner_block = soup.find('span', {'class': 'Petitioner_Advocate_table'})
    respondent_block = soup.find('span', {'class': 'Respondent_Advocate_table'})
    party_dict = dict()
    party_dict['petitioner_and_advocate'] = petitioner_block.text.encode('utf-8').replace('\xc2\xa0', '')
    party_dict['respondent_and_advocate'] = respondent_block.text.encode('utf-8').replace('\xc2\xa0', '')
    # print('party_dict: ')
    # print(party_dict)

    # Acts/sections table (optional).
    acts_table_soup = soup.find('table', {"id": "act_table"})
    # print(acts_table_soup)
    acts_dict = {}
    if acts_table_soup is not None:
        acts_dict = scrap_table_with_th_labels(acts_table_soup)
        # print ("acts_dict ")
        # print(acts_dict)
    else:
        print('No acts table')

    # Linked cases table (optional).
    linked_cases_table_soup = soup.find('table', {"class": "linkedCase"})
    # print(acts_table_soup)
    linked_cases_dict = {}
    if linked_cases_table_soup is not None:
        linked_cases_dict = scrap_table_with_th_labels(linked_cases_table_soup)
        #print ("linked_cases_dict " + str(linked_cases_dict))
    else:
        print('No linked cases table')

    # Hearing history table (optional).
    case_history_table_soup = soup.find('table', {"class": "history_table"})
    # print(case_history_table_soup)
    case_history_dict = {}
    if case_history_table_soup is not None:
        case_history_dict = scrap_table_with_th_labels(case_history_table_soup)
        #print ("case_history_dict ")
        #print(case_history_dict)
    else:
        print("No case history table")

    # Sub-matters: only the second <td> (the case number) is captured.
    sub_matters_block = soup.find("table", {"class": "MainCase"})
    submatters_dict = {}
    if sub_matters_block is not None:
        submatters_dict['case_number'] = sub_matters_block.find_all('td')[1].text\
            .encode('utf-8').replace('\xc2\xa0', '')
        # print('submatters_dict ')
        # print(submatters_dict)

    # Subordinate (lower) court info: <span> labels zipped with <label> values.
    subordinate_court_info_dict = {}
    keys = []
    values = []
    subordinate_court_block = soup.find('span', {'class': 'Lower_court_table'})
    if subordinate_court_block is not None:
        for key in subordinate_court_block.find_all('span'):
            keys.append(snake_case(key.text.encode('utf-8')))
        for value in subordinate_court_block.find_all('label'):
            values.append(value.text.encode('utf-8').replace('\xc2\xa0', ''))
        # print (keys, values)
        subordinate_court_info_dict = dict(zip(keys, values))
        keys = []
        values = []
    # print("subordinate_court_info_dict is below:")
    # print (subordinate_court_info_dict)

    orders_dict = {}

    # Each '#orderheading' element introduces one of the optional trailing
    # tables: orders / objection / document details / category details.
    for orderheading in soup.select("#orderheading"):

        table_name = orderheading.text.lower().strip()
        sibling = orderheading.next_sibling

        if table_name == 'orders':
            order_detail = scrap_table(sibling)
            # print(order_detail)
            download_threads = []
            links = order_detail['order_details'].split(';')[:-1]
            order_detail['order_links'] = []
            _dir = 'orders/' + case_id + '/'
            print "total orders: " + str(len(links))
            # One fire-and-forget thread per order PDF. Threads are started
            # below but never joined, so downloads may still be in flight
            # when this function returns; order_links records the *expected*
            # file paths, not confirmed downloads.
            for i in range(len(links)):
                download_threads.append(
                    Thread(target=pdf_downloader, args=(_dir, session, str(i + 1)
                                                        + ".pdf", links[i])))
                file_path = _dir + str(i + 1) + ".pdf"
                order_detail['order_links'].append(file_path)

            for thread in download_threads:
                thread.start()
            order_detail['order_links'] = '~'.join(order_detail['order_links'])
            #print download_threads, "DOWNLOAD THREADSSS"
            #print links, "LINKSSSSSSSSSSSSSSS"
            orders_dict = order_detail

        if table_name == 'objection':
            # print("objection_details"+str(scrap_details(sibling)))
            objections_dict = scrap_table(sibling)
            # print (objections_dict)
            # print ("")

        if table_name == 'document details':
            document_details_dict = scrap_table(sibling)
        if table_name == 'category details':
            category_details_dict['Category'] = sibling.find_all('td')[1].text
    # print('orders_dict: ')
    # print(orders_dict)

    # print('document_details_dict: ')
    # print (document_details_dict)
    # print('category_details_dict:')
    # print(category_details_dict)
    # print('objections_dict:')
    # print(objections_dict)

    # Code-triple -> High Court name lookup.
    # NOTE(review): the key below is built as state-court-dist; confirm the
    # table's keys use the same ordering. An unlisted court raises KeyError.
    courts__ = {'24-1-1': 'Sikkim', '18-1-1': 'Chhattisgarh', '12-1-1': 'Jammu & Kashmir, Jammu', '2-1-1': 'Hyderabad',
                '21-1-1': 'Meghalaya', '17-1-1': 'Gujarat', '15-1-1': 'Uttarakhand', '20-1-1': 'Tripura',
                '9-1-1': 'Rajasthan, Jaipur', '11-1-1': 'Orissa', '9-2-1': 'Rajasthan, Jodhpur', '4-1-1': 'Kerala',
                '7-1-1': 'Jharkhand', '10-1-1': 'Madras', '12-2-1': 'Jammu & Kashmir, Srinagar', '25-1-1': 'Manipur',
                '13-1-1': 'Allahabad', '5-1-1': 'Himachal Pradesh', '6-1-1': 'Gauhati'}

    case_det['court_name'] = courts__[case_det['state_code'] + '-' + case_det['court_code'] + '-' + case_det['dist_code']] + " High Court"

    # Everything scraped so far, grouped by section.
    details = {
        'case': case_det,
        'info': info_dict,
        'party': party_dict,
        'acts_dict': acts_dict,
        'linked_cases': linked_cases_dict,
        'case_history': case_history_dict,
        'submatters': submatters_dict,
        'subordinate_court_info': subordinate_court_info_dict,
        'orders': orders_dict,
        'document_details': document_details_dict,
        'category_details': category_details_dict,
        'objections': objections_dict,
    }

    print(details)

    # Maps details[section][scraped_key] -> flat model-field name used by
    # the (commented-out) HighCourt ORM persistence below.
    mapping = {
        'case': {
            'case_year': 'case_year',
            'case_number': 'case_number',
            'case_type_code': 'case_type_code',
            'state_code': 'state_code',
            'court_code': 'court_code',
            'dist_code': 'dist_code',
            'court_name': 'court_name'
        },
        'info': {
            'filing_number': 'filing_number',
            'first_hearing_date': 'first_hearing_date',
            'next_hearing_date': 'next_hearing_date',
            'cnr_number': 'cnr_number',
            'nature_of_disposal': 'nature_of_disposal',
            'causelist_name': 'cause_list_name',
            'coram': 'coram',
            'judicial': 'judicial',
            'registration_date': 'registration_date',
            'case_status': 'case_status',
            'bench': 'bench',
            'filing_date': 'filing_date',
            'case_type': 'case_type',
            'decision_date': 'decision_date',
            'registration_number': 'registration_number',
        },
        'party': {
            'respondent_and_advocate': 'respondent',
            'petitioner_and_advocate': 'petitioner'
        },
        'acts_dict': {
            'under_acts': 'under_act',
            'under_sections': 'under_section',
        },
        'linked_cases': {
            'filing_number': 'linked_case_filing_numbers',
            'case_number': 'linked_case_numbers'
        },
        'case_history': {
            'judge': 'case_history_judges',
            'business_on_date': 'case_history_business_on_date',
            'purpose_of_hearing': 'case_history_purpose_of_hearing',
            'hearing_date': 'case_history_hearing_date'
        },
        'submatters': {
            'case_number': 'submatters_case'
        },
        'subordinate_court_info': {
            'district': 'subordinate_court_district',
            'state': 'subordinate_court_state',
            'court_number_and_name': 'subordinate_court_name_number',
            'case_number_and_year': 'subordinate_court_case_number_year',
            'case_decision_date_': 'subordinate_court_decision_date'
        },
        'orders': {
            'judge': 'order_judges',
            'order_date': 'order_dates',
            'order_details': 'order_links'
        },
        'document_details': {
            'date_of_receiving': 'document_date_recieving',
            'document_no': 'document_number',
            'name_of_advocate': 'document_advocate_name',
            'filed_by': 'document_filed_by',

        },
        'category_details': {
            'category': 'case_category'
        },
        'objections': {
            'objection': 'objection',
            'scrutiny_date': 'scrutiny_date',
            'receipt_date': 'objection_reciept_date',
            'compliance_date': 'objection_compliance_date'
        }
    }

    # Flatten details per the mapping; keys a section didn't yield are skipped.
    _main_dict = dict()
    message = ''

    for _dict_name in mapping:
        for _dict_key, _model_field in mapping[_dict_name].items():
            try:
                _main_dict[_model_field] = details[_dict_name][_dict_key]
            except KeyError:
                pass

    # DB persistence (Django HighCourt model) is currently disabled; the
    # try/except is kept so re-enabling the block restores error handling.
    try:
        pass
        # try:
        #     case_obj = HighCourt.objects.get(**case_det)
        #     message += 'Case found in db, '

        #     next_hearing_date = info_dict.get("next_hearing_date", None)
        #     print("next_hearing_date")
        #     print(next_hearing_date, type(next_hearing_date))
        #     print("old wala")
        #     print(case_obj.next_hearing_date)

        #     if case_obj.next_hearing_date != next_hearing_date:

        #         success_flag = 2

        #         print("next hearing date updated")
        #         message += 'Next hearing date updated, '

        #         # case updation sms and email to user
        #         if user_id is not None:
        #             case_updation_text = 'Case Update Alert!' + '\n' + 'Case No.: ' + str(
        #                 case_det['case_number']) + ' Case Type: ' + info_dict['case_type'] + ' Year: ' + str(
        #                 case_det['case_year']) \
        #                 + ' has been reupdated'
        #             case_updation_text = '"' + case_updation_text + '"'
        #             # send_sms(case_updation_text, mobile_num)
        #             message += 'Message sent to user'

        #     for key in _main_dict:
        #         setattr(case_obj, key, _main_dict[key])

        #     print('this HighCourt case exists in db..So updating the case')
        #     case_obj.save()
        #     print('case updated successfully')
        #     message += 'Case updated'

        #     success_flag = 1

        # except ObjectDoesNotExist:
        #     case_obj = HighCourt(**_main_dict)
        #     case_obj.save()

        #     success_flag = 1

        #     print('case added successfully')
        #     message += 'Case added successfully, '
        #     success_flag = 1

        # if user_id is not None:
        #     case_obj.new_user.add(user_id)
        #     case_obj.save()
        #     message += 'Added user to case'

        #     success_flag = 1

    except Exception as e:
        print(str(e))
        print("failed to create HighCourt object in db")
        return 0, details, str(e)

    # category_block = soup.find('table', {"id":"orderheading"})
    # print(category_block.next_sibling)

    print('Scraping Time:', time() - scraping_start_time)

    return 1, details, message
437
438
def main(state_code, dist_code, court_code, case_type, case_no, case_year, user_id=None, *args):
    """Fetch one HC case-history page from eCourts and scrape it.

    state_code/dist_code/court_code identify the High Court bench;
    case_type, case_no and case_year identify the case; user_id is
    passed through to scrap_details for (disabled) DB association.

    Returns a (status, details, message) tuple:
         1 -> scraped successfully (details is the scraped dict)
         0 -> non-200 status or application-level 'ERROR' page
        -1 -> connection problem, read timeout, or case not found
    """
    state_code = str(state_code)
    dist_code = str(dist_code)
    court_code = str(court_code)
    # eCourts case-number layout: '2' + case type zero-padded to 3 digits
    # + case number zero-padded to 7 digits + 4-digit year.
    case_id = '2' + str(case_type).zfill(3) + str(case_no).zfill(7) + str(case_year)

    case_id = str(case_id)

    print('\n')
    print("State Code: " + state_code)
    print("Court Code: " + court_code)
    print("Dist Code: " + dist_code)
    print("Case ID: " + case_id)
    print('\n')

    url = 'https://services.ecourts.gov.in/ecourtindiaHC/cases/o_civil_case_history.php'
    case_data = {
        'state_code': str(state_code),
        'dist_code': str(dist_code),
        'court_code': str(court_code),
        'case_no': str(case_id)
    }

    case = {
        'state_code': state_code,
        'dist_code': dist_code,
        'court_code': court_code,
        'case_type_code': case_type,
        'case_number': case_no,
        'case_year': case_year
    }

    # Route both HTTP and HTTPS through the local proxy (Privoxy/Tor front).
    proxies = {"http": "127.0.0.1:8118", "https": "127.0.0.1:8118"}

    # Rotate user agents to look less like a bot.
    user_agent_list = [
        # Chrome
        'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        # Firefox
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
    ]

    header = random.choice(user_agent_list)
    header = {'user-agent': header}

    try:
        api_start_time = time()
        session = requests.session()
        # BUG FIX: this was session.proxies.update(proxies=proxies), which
        # stored the whole dict under the literal key 'proxies' instead of
        # merging the 'http'/'https' entries -- so requests made through
        # this session later (the PDF download threads in scrap_details)
        # were never proxied. Pass the mapping positionally.
        session.proxies.update(proxies)

        print(url, case_data)
        doc = session.post(url, data=case_data, timeout=50, proxies=proxies, headers=header)
        print(doc.status_code)

        print("API Time:", time() - api_start_time)

        if doc.status_code != 200 or doc.text.lower() == 'error':
            # print (doc.content)
            return 0, {}, 'Status code not 200 or ERROR'

    except (ConnectionError, ConnectTimeout):
        error = 'Error Connecting Server'
        print(error)
        return -1, {}, error
    except ReadTimeout:
        error = 'No Data Received from Server. Probably CASE DO NOT EXIST'
        print(error)
        return -1, {}, error

    soup = bs4.BeautifulSoup(doc.text, 'lxml')
    # Python-2 bytes handling: the server prefixes a UTF-8 BOM before
    # 'ERROR' when the case is not found.
    text = doc.text.encode('utf-8').strip(' ')
    # print(doc.text, '', text)

    if '\xef\xbb\xbfERROR' == text:
        return -1, {}, 'Case not found'

    return scrap_details(case_id, session, soup, user_id, case)
521
# Guarded entry point so importing this module does not immediately fire a
# network scrape. Sample invocation: Rajasthan-Jaipur bench (9-1-1),
# case type 30, case no. 400, year 2018.
if __name__ == '__main__':
    main(9, 1, 1, 30, 400, 2018)