# Paste metadata: 6 years ago · Oct 08, 2019, 10:40 PM
1from datetime import date, datetime, timedelta
2from statistics import mean
3
4from scrapinghub import ScrapinghubClient
5
6
# Module-level Scrapinghub client shared by get_post_process_info() and
# get_job_stats(). The string argument is a placeholder ("insert API key
# here") and must be replaced with a real API key before running.
client = ScrapinghubClient('--- insira api key aqui ---')
8
9
def filter_jobs_by_date(job_data, spider, days=7):
    """Return True when the job started within the last ``days`` days.

    Args:
        job_data: Job summary mapping; only its ``'key'`` entry is read.
        spider: Object exposing ``jobs.get(key)``. The returned job must
            provide ``metadata.get('scrapystats')``.
        days: Size of the look-back window in days. Defaults to 7.

    Returns:
        bool: True if the job's start date lies between ``today - days``
        and today (both ends inclusive). False when the job has no
        ``scrapystats`` or no ``start_time`` entry, since such a job
        cannot be placed inside the window.
    """
    today = date.today()
    window_start = today - timedelta(days=days)

    job = spider.jobs.get(job_data['key'])
    # A job may have no stats at all; treat that as "not in the window"
    # instead of failing on a None subscript.
    stats = job.metadata.get('scrapystats') or {}
    start_time_ms = stats.get('start_time')
    if start_time_ms is None:
        return False

    # The value is divided by 1000 before conversion, i.e. it is treated as
    # an epoch timestamp in milliseconds.
    started_on = datetime.fromtimestamp(start_time_ms / 1000.0).date()
    return window_start <= started_on <= today
17
18
def get_post_process_id_and_project(log):
    """Extract the post-process project id and job id from a log entry.

    Args:
        log: Mapping whose ``'message'`` string contains the text
            ``'Postprocess job scheduled: '`` followed by a job URL.

    Returns:
        tuple: ``(project_id, job_id)``. ``job_id`` is everything after
        ``'p/'`` in the URL; ``project_id`` is the part of ``job_id``
        before its first ``'/'``.

    Raises:
        IndexError: If the marker text or ``'p/'`` is absent from the message.
    """
    marker = 'Postprocess job scheduled: '
    after_marker = log['message'].split(marker)[1]

    # The URL ends where an optional " (" suffix begins.
    job_url, *_ = after_marker.split(' (')

    job_id = job_url.split('p/')[1]
    project_id, *_ = job_id.split('/', maxsplit=1)

    return project_id, job_id
27
28
def get_post_process_info(logs):
    """Return the ratio stats of the post-process job announced in ``logs``.

    The first log entry whose message contains
    ``"Postprocess job scheduled:"`` is parsed for a project id and job id;
    that job's ``scrapystats`` are then fetched through the module-level
    ``client``.

    Args:
        logs: Iterable of mappings, each with a ``'message'`` string.

    Returns:
        dict | None: A dict with the keys ``'styles_added'``,
        ``'styles_moved'`` and ``'products_added'``. Every value defaults
        to 0 when the matching stat is absent. None is returned when no
        log entry announces a post-process job or when that job has no
        ``scrapystats``.
    """
    marker = "Postprocess job scheduled:"
    # Only the first announcement is used, so stop scanning once it is found.
    scheduled_log = next(
        (entry for entry in logs if marker in entry['message']),
        None,
    )
    if scheduled_log is None:
        return None

    project_id, job_id = get_post_process_id_and_project(scheduled_log)
    job = client.get_project(project_id).jobs.get(job_id)
    stats = job.metadata.get('scrapystats')
    if not stats:
        return None

    # All three stats share the same default so that callers averaging the
    # values never receive None for a missing counter.
    return {
        'styles_added': stats.get('mongo/ratio/styles/added', 0),
        'styles_moved': stats.get('mongo/ratio/styles/moved', 0),
        'products_added': stats.get('mongo/ratio/products/added', 0),
    }
51
52
def _mean_or_none(values):
    """Return the arithmetic mean of ``values``, or None when it is empty."""
    return mean(values) if values else None


def get_job_stats(project_id, spider_name):
    """Average the post-process ratio stats of a spider's recent jobs.

    Jobs are taken from ``spider.jobs.list()``, restricted to those whose
    ``close_reason`` is ``'finished'`` and that pass
    ``filter_jobs_by_date``. For each remaining job the post-process stats
    are looked up from its logs via ``get_post_process_info``.

    Args:
        project_id: Identifier passed to ``client.get_project``.
        spider_name: Name passed to ``project.spiders.get``.

    Returns:
        dict: Keys ``'styles/added'``, ``'styles/moved'`` and
        ``'products/added'``, each mapped to the mean of the collected
        values, or to None when no value was collected for that metric.
    """
    project = client.get_project(project_id)
    spider = project.spiders.get(spider_name)

    # Use .get(): a job summary without a 'close_reason' entry is simply
    # not finished and must not abort the whole run with a KeyError.
    finished_jobs = [
        job_data
        for job_data in sorted(spider.jobs.list(), key=lambda item: item['key'])
        if job_data.get('close_reason') == 'finished'
    ]

    samples = {'styles_added': [], 'styles_moved': [], 'products_added': []}

    for job_data in finished_jobs:
        if not filter_jobs_by_date(job_data, spider):
            continue

        job = spider.jobs.get(job_data['key'])
        stats = get_post_process_info(job.logs.list())
        if not stats:
            continue

        for metric, values in samples.items():
            value = stats.get(metric)
            # Skip missing values; a single None would make mean() fail.
            if value is not None:
                values.append(value)

    # mean() raises StatisticsError on empty data, so metrics without any
    # sample are reported as None instead.
    return {
        'styles/added': _mean_or_none(samples['styles_added']),
        'styles/moved': _mean_or_none(samples['styles_moved']),
        'products/added': _mean_or_none(samples['products_added']),
    }
84
85
if __name__ == '__main__':
    # Script entry point: both strings are placeholders to be replaced with
    # a real project id and spider name before running.
    project_id = '--- spider project id ---'
    spider_name = '--- spider name ---'

    # Compute the averaged stats and write them to stdout.
    print(get_job_stats(project_id, spider_name))
91 print(stats)