import csv
import json
import requests
import argparse
from lxml import html
import apis
from urllib.parse import urlparse
import subprocess


HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}


def parse_arguments():
    ap = argparse.ArgumentParser()
    ap.add_argument("mode", nargs=1)
    # Arguments for the tracking scan
    ap.add_argument("-c", "--csv", required=False, help="Path to the input sites.csv file.")
    # Arguments for the API scan
    ap.add_argument("-t", "--tracking", required=False, help="Path to the input sites-tracking-ids.csv file.")
    ap.add_argument("-k", "--key", required=False, help="API key.")

    args = ap.parse_args()

    mode = args.mode[0]
    if mode in ('unique', 'publicwww', 'uniquewww', 'lynx', 'lynxcount'):
        pass
    elif mode == 'scrape' and args.csv and args.tracking is None:
        pass
    elif mode == 'apis' and args.tracking is not None and args.key is not None and args.csv is None:
        pass
    else:
        # ap.error() prints the message and exits with status 2, so no
        # explicit exit() is needed after it
        ap.error("invalid mode/argument combination; modes: scrape, apis, unique, publicwww, uniquewww, lynx, lynxcount")

    return args
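
# Example invocations, one per mode (<key> stands for your own API key):
#   python scraper.py scrape -c sites.csv
#   python scraper.py unique -t sites-tracking-ids.csv
#   python scraper.py apis -t sites-tracking-ids.csv -k <key>
#   python scraper.py publicwww -k <key>
#   python scraper.py uniquewww
#   python scraper.py lynx -c sites.csv
#   python scraper.py lynxcount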


def open_sites_csv(path):
    sites = []
    with open(path) as csvfile:
        # skip the 'domain' header line
        next(csvfile)
        csvreader = csv.reader(csvfile, dialect='excel')
        for row in csvreader:
            # the site domain is in the second column
            sites.append(row[1])
    return sites
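
# The reader above assumes a sites.csv with one header row and the domain in
# the second column, e.g. (the first column's meaning is a guess):
#   rank,domain
#   1,example.com
#   2,example.org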


def scrape_website_for_ids(url):
    print("Scraping: " + url)
    nsite = "http://" + url
    ids = {'quant': '', 'ganal': '', 'pixel': ''}
    try:
        page = requests.get(nsite, headers=HEADERS)
        tree = html.fromstring(page.content)

        quant = tree.xpath("//img[contains(@src, 'pixel.quantserve')]")
        ganal = tree.xpath("//script[contains(@src, 'googletag')]")
        pixel = tree.xpath("//img[contains(@src, 'facebook.com/tr?id=')]")

        for item in quant:
            # Quantcast IDs look like p-XXXXXXXXX and run to the end of the src
            attribute = item.attrib['src']
            index = attribute.index("p-")
            ids['quant'] = attribute[index:]
        for item in ganal:
            # Google Analytics IDs look like UA-XXXXXXX-N
            attribute = item.attrib['src']
            index = attribute.index("UA-")
            ids['ganal'] = attribute[index:]
        for item in pixel:
            # Facebook pixel IDs sit between 'id=' and '&ev=' in the src
            attribute = item.attrib['src']
            index = attribute.index("id=")
            index2 = attribute.index("&ev=")
            ids['pixel'] = attribute[index:index2]
    except Exception:
        # network failures, unparseable pages, or IDs in an unexpected
        # format all leave the ids empty for this site
        pass
    return ids


def scrape_ids(sites_path):
    site_ids = dict()
    sites_list = open_sites_csv(sites_path)
    for site in sites_list:
        ids = scrape_website_for_ids(site)
        site_ids[site] = ids
    output_tracking_ids(site_ids)


def output_tracking_ids(site_ids):
    with open('sites-tracking-ids.csv', 'w', newline='') as csvfile:
        output = csv.writer(csvfile, dialect='excel')
        output.writerow(["site", "google-analytics", "fb-pixel", "quantserve"])
        for key in site_ids:
            output.writerow([key, site_ids[key]['ganal'], site_ids[key]['pixel'], site_ids[key]['quant']])
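
# Example sites-tracking-ids.csv produced above (values are illustrative):
#   site,google-analytics,fb-pixel,quantserve
#   example.com,UA-1234567-1,id=123456789,p-AbCdEfGh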


def output_unique_ids_lists(sites_tracking_csv_path):
    sites_ids = open_sites_tracking_ids_csv(sites_tracking_csv_path)

    gids = set()
    pids = set()
    qids = set()
    for site in sites_ids:
        if "ganal" in sites_ids[site]:
            tag = sites_ids[site]['ganal']
            if tag is not None and tag != '':
                # Collapse UA-XXXXXXX-N to UA-XXXXXXX- so that different
                # properties of the same Analytics account count as one ID
                property_index = tag.find("-", 4)
                gid_without_property = tag
                if property_index > -1:
                    gid_without_property = tag[0:property_index]
                    gid_without_property += "-"
                gids.add(gid_without_property)
        if "quant" in sites_ids[site]:
            tag = sites_ids[site]['quant']
            if tag is not None and tag != '':
                qids.add(tag)
        if "pixel" in sites_ids[site]:
            tag = sites_ids[site]['pixel']
            if tag is not None and tag != '':
                pids.add(tag)

    with open('unique-gids.csv', 'w', newline='') as csvfile:
        output = csv.writer(csvfile, dialect='excel')
        output.writerow(["google-analytics"])
        for gid in gids:
            output.writerow([gid])

    with open('unique-pids.csv', 'w', newline='') as csvfile:
        output = csv.writer(csvfile, dialect='excel')
        output.writerow(["facebook-pixel"])
        for pid in pids:
            output.writerow([pid])

    with open('unique-qids.csv', 'w', newline='') as csvfile:
        output = csv.writer(csvfile, dialect='excel')
        output.writerow(["quantserve-id"])
        for qid in qids:
            output.writerow([qid])


def open_sites_tracking_ids_csv(path):
    sites_ids = dict()
    with open(path) as csvfile:
        # skip the header
        next(csvfile)
        csvreader = csv.reader(csvfile, dialect='excel')
        for row in csvreader:
            # columns: site, google-analytics, fb-pixel, quantserve
            ids = {'quant': row[3], 'pixel': row[2], 'ganal': row[1]}
            sites_ids[row[0]] = ids
    return sites_ids


def scan_spy_on_web(sites_tracking_csv_path, api_key):
    sites_ids = open_sites_tracking_ids_csv(sites_tracking_csv_path)

    # build the unique list of analytics ids
    analytics_ids = set()
    for site in sites_ids:
        if "ganal" in sites_ids[site]:
            tag = sites_ids[site]['ganal']
            if tag is not None and tag != '':
                # Collapse UA-XXXXXXX-N to UA-XXXXXXX-, as in
                # output_unique_ids_lists() above
                property_index = tag.find("-", 4)
                gid_without_property = tag
                if property_index > -1:
                    gid_without_property = tag[0:property_index]
                    gid_without_property += "-"
                analytics_ids.add(gid_without_property)

    json_list = []
    for gid in analytics_ids:
        print("Querying for GID: " + gid)
        # query SpyOnWeb for this gid; avoid shadowing the json module
        result = apis.spy_on_web(gid, api_key)
        json_list.append(result)
    output_analytics_json(json_list)


def open_unique_id_file(path):
    ids = list()
    with open(path) as csvfile:
        # skip the header
        next(csvfile)
        csvreader = csv.reader(csvfile, dialect='excel')
        for row in csvreader:
            ids.append(row[0])
    return ids


def scan_publicwww(api_key):
    gids = open_unique_id_file("./unique-gids.csv")
    pids = open_unique_id_file("./unique-pids.csv")
    qids = open_unique_id_file("./unique-qids.csv")

    results = dict()
    for pid in pids:
        print("PID: " + pid)
        links = apis.publicwww(pid, api_key)
        results[pid] = links
    for gid in gids:
        print("GID: " + gid)
        links = apis.publicwww(gid, api_key)
        results[gid] = links
    for qid in qids:
        print("QID: " + qid)
        links = apis.publicwww(qid, api_key)
        results[qid] = links

    for key in results:
        links = results[key]
        print("site: " + key)
        print(*links)

    with open('publicwww.csv', 'w', newline='') as csvfile:
        output = csv.writer(csvfile, dialect='excel')
        output.writerow(["site", "links"])
        for key in results:
            output.writerow([key, " ".join(results[key])])


def output_analytics_json(json_list):
    with open("api-scan-output.txt", 'w') as txtfile:
        for js in json_list:
            # one pretty-printed JSON document per query, newline-separated
            txtfile.write(json.dumps(js, indent=4, sort_keys=True) + "\n")


def output_unique_www():
    unique_links = set()
    with open("./publicwww.csv") as csvfile:
        # skip the header
        next(csvfile)
        csvreader = csv.reader(csvfile, dialect='excel')
        for row in csvreader:
            links = row[1].split(" ")
            for linkstr in links:
                # only the host matters here, so reduce each link to its netloc
                parsed = urlparse(linkstr)
                if parsed.netloc:
                    unique_links.add(parsed.netloc)

    for link in unique_links:
        print(link)


def output_site_links_map(sitespath):
    sites = open_sites_csv(sitespath)

    site_map = dict()
    for site in sites:
        if site is None or site == '':
            continue

        # lynx -listonly -dump <site> prints a numbered list of links, one per
        # line, e.g. "   1. http://example.com/"
        command = ['lynx', '-listonly', '-dump', site]
        # check=True would raise on a non-zero return code; errors are
        # tolerated here so one bad site does not stop the whole run
        completed = subprocess.run(command, stdout=subprocess.PIPE, universal_newlines=True)
        output = completed.stdout

        site_links = set()
        for line in output.split("\n"):
            if "http" not in line:
                continue
            # strip the leading "N." counter and keep the URL
            link = line.strip().split(" ")[1]
            site_links.add(link)

        site_map[site] = site_links

    with open('link_map.csv', 'w', newline='') as csvfile:
        output = csv.writer(csvfile, dialect='excel')
        output.writerow(["site", "links"])
        for site in site_map:
            output.writerow([site, " ".join(site_map[site])])


def count_links():
    link_map = dict()
    with open("link_map.csv") as csvfile:
        # skip the header
        next(csvfile)
        csvreader = csv.reader(csvfile, dialect='excel')
        for row in csvreader:
            site = row[0]
            links = row[1].split(" ")
            link_map[site] = links

    # count how many of the scanned sites link to each URL
    site_counts = dict()
    for site in link_map:
        for link in link_map[site]:
            site_counts[link] = site_counts.get(link, 0) + 1

    # print the links in ascending order of count
    sorted_sites = {k: v for k, v in sorted(site_counts.items(), key=lambda item: item[1])}
    for site in sorted_sites:
        print("site: " + site + " " + "count: " + str(sorted_sites[site]))


def main():
    args = parse_arguments()
    mode = args.mode[0]
    if mode == 'scrape':
        scrape_ids(args.csv)
    elif mode == 'apis':
        scan_spy_on_web(args.tracking, args.key)
    elif mode == 'unique':
        output_unique_ids_lists(args.tracking)
    elif mode == 'lynx':
        output_site_links_map(args.csv)
    elif mode == 'lynxcount':
        count_links()
    elif mode == 'publicwww':
        scan_publicwww(args.key)
    elif mode == 'uniquewww':
        output_unique_www()


if __name__ == '__main__':
    main()
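
# ---------------------------------------------------------------------------
# The companion apis module imported at the top is not part of this paste.
# Below is a minimal sketch of the interface this script expects from it,
# assuming SpyOnWeb's v1 analytics endpoint and a PublicWWW URL export; the
# exact URLs, parameter names, and response shapes are assumptions, not
# confirmed against either service's documentation.
#
# import requests
#
# def spy_on_web(gid, api_key):
#     # return the parsed JSON for one Google Analytics ID
#     url = "https://api.spyonweb.com/v1/analytics/" + gid
#     resp = requests.get(url, params={"access_token": api_key})
#     return resp.json()
#
# def publicwww(tracking_id, api_key):
#     # return a list of page URLs that embed the given tracking ID
#     url = 'https://publicwww.com/websites/"' + tracking_id + '"/'
#     resp = requests.get(url, params={"export": "urls", "key": api_key})
#     return [line for line in resp.text.splitlines() if line.strip()]
# ---------------------------------------------------------------------------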