# Pasted source — original snippet dated Mar 05, 2020.
import argparse
import csv
import json

import requests
from lxml import html

import apis
8
# Desktop-Chrome User-Agent so sites serve the same markup a real browser gets.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
10
11
def parse_arguments():
    """Parse and validate the command-line arguments.

    Returns the parsed argparse.Namespace. Exits via ap.error() (status 2)
    when the mode/flag combination is inconsistent:
      - mode 'scrape' requires --csv and forbids --tracking
      - mode 'apis'   requires --tracking and --key and forbids --csv
    """
    ap = argparse.ArgumentParser()
    # Positional mode kept as nargs=1 so callers can keep using args.mode[0];
    # choices= lets argparse reject unknown modes itself.
    ap.add_argument("mode", nargs=1, choices=["scrape", "apis"])
    # Argument for the tracking-id scrape.
    ap.add_argument("-c", "--csv", required=False, help="Input file sites.csv's path.")
    # Arguments for the API scan.
    ap.add_argument("-t", "--tracking", required=False, help="Input file sites-tracking-ids.csv's path.")
    ap.add_argument("-k", "--key", required=False, help="API key.")
    args = ap.parse_args()

    mode = args.mode[0]
    scrape_ok = mode == 'scrape' and args.csv is not None and args.tracking is None
    apis_ok = (mode == 'apis' and args.tracking is not None
               and args.key is not None and args.csv is None)
    if not (scrape_ok or apis_ok):
        # ap.error() prints the message and exits with status 2; the original
        # had an unreachable exit(-1) after this call.
        ap.error("Usage: python scraper.py <scrape || apis> <csv || tracking>")
    return args
31
32
def open_sites_csv(path):
    """Read the sites CSV and return the list of domains (second column).

    Skips the header row. Unlike the original, an empty file no longer
    raises StopIteration and blank/short rows no longer raise IndexError.
    """
    sites = []
    with open(path, newline='') as csvfile:
        csvreader = csv.reader(csvfile, dialect='excel')
        # Skip the 'domain' header row; the None default guards an empty file.
        next(csvreader, None)
        for row in csvreader:
            # Data rows look like (rank, domain); ignore rows too short
            # to carry a domain.
            if len(row) > 1:
                sites.append(row[1])
    return sites
42
43
def scrape_website_for_ids(url):
    """Fetch http://<url> and scrape third-party tracking IDs from its HTML.

    Returns a dict with keys 'quant' (Quantcast), 'ganal' (Google
    Analytics) and 'pixel' (Facebook pixel); a value is '' when the
    corresponding tag was not found or could not be parsed. Network and
    parse failures are swallowed (best-effort scrape) and yield ''.

    Output format is kept identical to the original: the pixel value
    retains its 'id=' prefix, and when several matching elements exist
    the last one wins.
    """
    print("Scraping: " + url)
    ids = {'quant': '', 'ganal': '', 'pixel': ''}
    try:
        # timeout keeps a dead host from hanging the whole scan.
        page = requests.get("http://" + url, headers=HEADERS, timeout=30)
        tree = html.fromstring(page.content)
    except Exception:
        # Unreachable/unparseable site: report no IDs instead of crashing.
        return ids

    # (xpath, result key, marker that starts the ID, end marker or None).
    extractors = (
        ("//img[contains(@src, 'pixel.quantserve')]", 'quant', 'p-', None),
        ("//script[contains(@src, 'googletag')]", 'ganal', 'UA-', None),
        ("//img[contains(@src, 'facebook.com/tr?id=')]", 'pixel', 'id=', '&ev='),
    )
    for xpath, key, start, end in extractors:
        for item in tree.xpath(xpath):
            src = item.attrib.get('src', '')
            try:
                begin = src.index(start)
                ids[key] = src[begin:src.index(end)] if end else src[begin:]
            except ValueError:
                # Marker missing from this src: skip this element instead of
                # aborting the whole extraction (the original's bare except
                # silently dropped every remaining ID on the first mismatch).
                continue
    return ids
77
78
def scrape_ids(sites_path):
    """Scrape every site listed in the CSV and write the collected IDs out."""
    results = {site: scrape_website_for_ids(site)
               for site in open_sites_csv(sites_path)}
    output_tracking_ids(results)
86
87
def output_tracking_ids(site_ids):
    """Write the scraped tracking IDs to sites-tracking-ids.csv.

    One row per site: site, google-analytics, fb-pixel, quantserve.
    """
    with open('sites-tracking-ids.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        writer.writerow(["site", "google-analytics", "fb-pixel", "quantserve"])
        for site, ids in site_ids.items():
            writer.writerow([site, ids['ganal'], ids['pixel'], ids['quant']])
95
96
def open_sites_tracking_ids_csv(path):
    """Read sites-tracking-ids.csv back into {site: {'quant','pixel','ganal'}}.

    Skips the header row. Unlike the original, an empty file no longer
    raises StopIteration and short rows no longer raise IndexError.
    """
    sites_ids = dict()
    with open(path, newline='') as csvfile:
        csvreader = csv.reader(csvfile, dialect='excel')
        # Skip the header; the None default guards an empty file.
        next(csvreader, None)
        for row in csvreader:
            # Columns: site, google-analytics, fb-pixel, quantserve.
            if len(row) >= 4:
                sites_ids[row[0]] = {'quant': row[3], 'pixel': row[2], 'ganal': row[1]}
    return sites_ids
107
108
def scan_apis(sites_tracking_csv_path, api_key):
    """Query the SpyOnWeb API for every unique Google Analytics account ID.

    Reads the scraped tracking-id CSV, strips the per-property suffix from
    each GA tag (UA-XXXX-N -> UA-XXXX), de-duplicates the account IDs,
    queries the API once per account and writes the JSON responses to
    api-scan-output.txt.
    """
    sites_ids = open_sites_tracking_ids_csv(sites_tracking_csv_path)

    # Build the unique set of GA account IDs, property suffix removed.
    analytics_ids = set()
    for site in sites_ids:
        tag = sites_ids[site].get('ganal')
        if tag:  # skip missing/empty tags
            # 'UA-123456-2' -> 'UA-123456': the dash after position 4
            # separates the account ID from the property number.
            property_index = tag.find("-", 4)
            if property_index > -1:
                tag = tag[:property_index]
            analytics_ids.add(tag)

    # Local renamed from `json` to avoid shadowing the json module.
    responses = []
    for gid in analytics_ids:
        print("Querying for GID: " + gid)
        # BUG FIX: the original passed the stale loop variable `tag` here,
        # so every API request queried the same (last-parsed) ID.
        responses.append(apis.spy_on_web(gid, api_key))
    output_analytics_json(responses)
130
def output_analytics_json(json_list):
    """Dump each API response as pretty-printed JSON to api-scan-output.txt.

    Responses are written back-to-back with no separator, exactly as before.
    """
    serialized = (json.dumps(entry, indent=4, sort_keys=True) for entry in json_list)
    with open("api-scan-output.txt", 'w') as txtfile:
        txtfile.write("".join(serialized))
135
def main():
    """Entry point: dispatch to the scraper or the API scanner by mode."""
    args = parse_arguments()
    actions = {
        'scrape': lambda: scrape_ids(args.csv),
        'apis': lambda: scan_apis(args.tracking, args.key),
    }
    action = actions.get(args.mode[0])
    if action is not None:
        action()
143
144
# Run the CLI only when executed as a script, not when imported.
if __name__ == '__main__':
    main()