import csv
import json
import requests
import argparse
from lxml import html
import apis
from urllib.parse import urlparse
import subprocess


HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}


def parse_arguments():
    ap = argparse.ArgumentParser()
    ap.add_argument("mode", nargs=1)
    # Arguments for the tracking scan
    ap.add_argument("-c", "--csv", required=False, help="Path to the input sites.csv file.")
    # Arguments for the API scan
    ap.add_argument("-t", "--tracking", required=False, help="Path to the input sites-tracking-ids.csv file.")
    ap.add_argument("-k", "--key", required=False, help="API key.")

    args = ap.parse_args()

    mode = args.mode[0]
    if mode in ('unique', 'publicwww', 'uniquewww', 'lynx', 'lynxcount'):
        pass
    elif mode == 'scrape' and args.csv and args.tracking is None:
        pass
    elif mode == 'apis' and args.tracking is not None and args.key is not None and args.csv is None:
        pass
    else:
        # ap.error() prints the message and exits with status 2, so no
        # explicit exit() is needed after it
        ap.error("invalid mode/argument combination; modes: scrape, apis, unique, publicwww, uniquewww, lynx, lynxcount")

    return args
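
# Example invocations, one per mode (<key> stands for your own API key):
#   python scraper.py scrape -c sites.csv
#   python scraper.py unique -t sites-tracking-ids.csv
#   python scraper.py apis -t sites-tracking-ids.csv -k <key>
#   python scraper.py publicwww -k <key>
#   python scraper.py uniquewww
#   python scraper.py lynx -c sites.csv
#   python scraper.py lynxcount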


def open_sites_csv(path):
    sites = []
    with open(path) as csvfile:
        # skip the 'domain' header line
        next(csvfile)
        csvreader = csv.reader(csvfile, dialect='excel')
        for row in csvreader:
            # the site domain is in the second column
            sites.append(row[1])
    return sites
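
# The reader above assumes a sites.csv with one header row and the domain in
# the second column, e.g. (the first column's meaning is a guess):
#   rank,domain
#   1,example.com
#   2,example.org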


def scrape_website_for_ids(url):
    print("Scraping: " + url)
    nsite = "http://" + url
    ids = {'quant': '', 'ganal': '', 'pixel': ''}
    try:
        page = requests.get(nsite, headers=HEADERS)
        tree = html.fromstring(page.content)

        quant = tree.xpath("//img[contains(@src, 'pixel.quantserve')]")
        ganal = tree.xpath("//script[contains(@src, 'googletag')]")
        pixel = tree.xpath("//img[contains(@src, 'facebook.com/tr?id=')]")

        for item in quant:
            # Quantcast IDs look like p-XXXXXXXXX and run to the end of the src
            attribute = item.attrib['src']
            index = attribute.index("p-")
            ids['quant'] = attribute[index:]
        for item in ganal:
            # Google Analytics IDs look like UA-XXXXXXX-N
            attribute = item.attrib['src']
            index = attribute.index("UA-")
            ids['ganal'] = attribute[index:]
        for item in pixel:
            # Facebook pixel IDs sit between 'id=' and '&ev=' in the src
            attribute = item.attrib['src']
            index = attribute.index("id=")
            index2 = attribute.index("&ev=")
            ids['pixel'] = attribute[index:index2]
    except Exception:
        # network failures, unparseable pages, or IDs in an unexpected
        # format all leave the ids empty for this site
        pass
    return ids


def scrape_ids(sites_path):
    site_ids = dict()
    sites_list = open_sites_csv(sites_path)
    for site in sites_list:
        ids = scrape_website_for_ids(site)
        site_ids[site] = ids
    output_tracking_ids(site_ids)


def output_tracking_ids(site_ids):
    with open('sites-tracking-ids.csv', 'w', newline='') as csvfile:
        output = csv.writer(csvfile, dialect='excel')
        output.writerow(["site", "google-analytics", "fb-pixel", "quantserve"])
        for key in site_ids:
            output.writerow([key, site_ids[key]['ganal'], site_ids[key]['pixel'], site_ids[key]['quant']])
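
# Example sites-tracking-ids.csv produced above (values are illustrative):
#   site,google-analytics,fb-pixel,quantserve
#   example.com,UA-1234567-1,id=123456789,p-AbCdEfGh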


def output_unique_ids_lists(sites_tracking_csv_path):
    sites_ids = open_sites_tracking_ids_csv(sites_tracking_csv_path)

    gids = set()
    pids = set()
    qids = set()
    for site in sites_ids:
        if "ganal" in sites_ids[site]:
            tag = sites_ids[site]['ganal']
            if tag is not None and tag != '':
                # Collapse UA-XXXXXXX-N to UA-XXXXXXX- so that different
                # properties of the same Analytics account count as one ID
                property_index = tag.find("-", 4)
                gid_without_property = tag
                if property_index > -1:
                    gid_without_property = tag[0:property_index]
                    gid_without_property += "-"
                gids.add(gid_without_property)
        if "quant" in sites_ids[site]:
            tag = sites_ids[site]['quant']
            if tag is not None and tag != '':
                qids.add(tag)
        if "pixel" in sites_ids[site]:
            tag = sites_ids[site]['pixel']
            if tag is not None and tag != '':
                pids.add(tag)

    with open('unique-gids.csv', 'w', newline='') as csvfile:
        output = csv.writer(csvfile, dialect='excel')
        output.writerow(["google-analytics"])
        for gid in gids:
            output.writerow([gid])

    with open('unique-pids.csv', 'w', newline='') as csvfile:
        output = csv.writer(csvfile, dialect='excel')
        output.writerow(["facebook-pixel"])
        for pid in pids:
            output.writerow([pid])

    with open('unique-qids.csv', 'w', newline='') as csvfile:
        output = csv.writer(csvfile, dialect='excel')
        output.writerow(["quantserve-id"])
        for qid in qids:
            output.writerow([qid])


def open_sites_tracking_ids_csv(path):
    sites_ids = dict()
    with open(path) as csvfile:
        # skip the header
        next(csvfile)
        csvreader = csv.reader(csvfile, dialect='excel')
        for row in csvreader:
            # columns: site, google-analytics, fb-pixel, quantserve
            ids = {'quant': row[3], 'pixel': row[2], 'ganal': row[1]}
            sites_ids[row[0]] = ids
    return sites_ids


def scan_spy_on_web(sites_tracking_csv_path, api_key):
    sites_ids = open_sites_tracking_ids_csv(sites_tracking_csv_path)

    # build the unique list of analytics ids
    analytics_ids = set()
    for site in sites_ids:
        if "ganal" in sites_ids[site]:
            tag = sites_ids[site]['ganal']
            if tag is not None and tag != '':
                # Collapse UA-XXXXXXX-N to UA-XXXXXXX-, as in
                # output_unique_ids_lists() above
                property_index = tag.find("-", 4)
                gid_without_property = tag
                if property_index > -1:
                    gid_without_property = tag[0:property_index]
                    gid_without_property += "-"
                analytics_ids.add(gid_without_property)

    json_list = []
    for gid in analytics_ids:
        print("Querying for GID: " + gid)
        # query SpyOnWeb for this gid; avoid shadowing the json module
        result = apis.spy_on_web(gid, api_key)
        json_list.append(result)
    output_analytics_json(json_list)


def open_unique_id_file(path):
    ids = list()
    with open(path) as csvfile:
        # skip the header
        next(csvfile)
        csvreader = csv.reader(csvfile, dialect='excel')
        for row in csvreader:
            ids.append(row[0])
    return ids


def scan_publicwww(api_key):
    gids = open_unique_id_file("./unique-gids.csv")
    pids = open_unique_id_file("./unique-pids.csv")
    qids = open_unique_id_file("./unique-qids.csv")

    results = dict()
    for pid in pids:
        print("PID: " + pid)
        links = apis.publicwww(pid, api_key)
        results[pid] = links
    for gid in gids:
        print("GID: " + gid)
        links = apis.publicwww(gid, api_key)
        results[gid] = links
    for qid in qids:
        print("QID: " + qid)
        links = apis.publicwww(qid, api_key)
        results[qid] = links

    for key in results:
        links = results[key]
        print("site: " + key)
        print(*links)

    with open('publicwww.csv', 'w', newline='') as csvfile:
        output = csv.writer(csvfile, dialect='excel')
        output.writerow(["site", "links"])
        for key in results:
            output.writerow([key, " ".join(results[key])])


def output_analytics_json(json_list):
    with open("api-scan-output.txt", 'w') as txtfile:
        for js in json_list:
            # one pretty-printed JSON document per query, newline-separated
            txtfile.write(json.dumps(js, indent=4, sort_keys=True) + "\n")


def output_unique_www():
    unique_links = set()
    with open("./publicwww.csv") as csvfile:
        # skip the header
        next(csvfile)
        csvreader = csv.reader(csvfile, dialect='excel')
        for row in csvreader:
            links = row[1].split(" ")
            for linkstr in links:
                # only the host matters here, so reduce each link to its netloc
                parsed = urlparse(linkstr)
                if parsed.netloc:
                    unique_links.add(parsed.netloc)

    for link in unique_links:
        print(link)


def output_site_links_map(sitespath):
    sites = open_sites_csv(sitespath)

    site_map = dict()
    for site in sites:
        if site is None or site == '':
            continue

        # lynx -listonly -dump <site> prints a numbered list of links, one per
        # line, e.g. "   1. http://example.com/"
        command = ['lynx', '-listonly', '-dump', site]
        # check=True would raise on a non-zero return code; errors are
        # tolerated here so one bad site does not stop the whole run
        completed = subprocess.run(command, stdout=subprocess.PIPE, universal_newlines=True)
        output = completed.stdout

        site_links = set()
        for line in output.split("\n"):
            if "http" not in line:
                continue
            # strip the leading "N." counter and keep the URL
            link = line.strip().split(" ")[1]
            site_links.add(link)

        site_map[site] = site_links

    with open('link_map.csv', 'w', newline='') as csvfile:
        output = csv.writer(csvfile, dialect='excel')
        output.writerow(["site", "links"])
        for site in site_map:
            output.writerow([site, " ".join(site_map[site])])


def count_links():
    link_map = dict()
    with open("link_map.csv") as csvfile:
        # skip the header
        next(csvfile)
        csvreader = csv.reader(csvfile, dialect='excel')
        for row in csvreader:
            site = row[0]
            links = row[1].split(" ")
            link_map[site] = links

    # count how many of the scanned sites link to each URL
    site_counts = dict()
    for site in link_map:
        for link in link_map[site]:
            site_counts[link] = site_counts.get(link, 0) + 1

    # print the links in ascending order of count
    sorted_sites = {k: v for k, v in sorted(site_counts.items(), key=lambda item: item[1])}
    for site in sorted_sites:
        print("site: " + site + " " + "count: " + str(sorted_sites[site]))


def main():
    args = parse_arguments()
    mode = args.mode[0]
    if mode == 'scrape':
        scrape_ids(args.csv)
    elif mode == 'apis':
        scan_spy_on_web(args.tracking, args.key)
    elif mode == 'unique':
        output_unique_ids_lists(args.tracking)
    elif mode == 'lynx':
        output_site_links_map(args.csv)
    elif mode == 'lynxcount':
        count_links()
    elif mode == 'publicwww':
        scan_publicwww(args.key)
    elif mode == 'uniquewww':
        output_unique_www()


if __name__ == '__main__':
    main()
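
# ---------------------------------------------------------------------------
# The companion apis module imported at the top is not part of this paste.
# Below is a minimal sketch of the interface this script expects from it,
# assuming SpyOnWeb's v1 analytics endpoint and a PublicWWW URL export; the
# exact URLs, parameter names, and response shapes are assumptions, not
# confirmed against either service's documentation.
#
# import requests
#
# def spy_on_web(gid, api_key):
#     # return the parsed JSON for one Google Analytics ID
#     url = "https://api.spyonweb.com/v1/analytics/" + gid
#     resp = requests.get(url, params={"access_token": api_key})
#     return resp.json()
#
# def publicwww(tracking_id, api_key):
#     # return a list of page URLs that embed the given tracking ID
#     url = 'https://publicwww.com/websites/"' + tracking_id + '"/'
#     resp = requests.get(url, params={"export": "urls", "key": api_key})
#     return [line for line in resp.text.splitlines() if line.strip()]
# ---------------------------------------------------------------------------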