# Pasted source — original snippet dated Mar 05, 2020.
import argparse
import csv
import json

import requests
from lxml import html

import apis
8
# Desktop-Chrome User-Agent so sites serve the same markup a real browser gets.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
10
11
def parse_arguments():
    """Parse and validate the command-line arguments.

    Returns the parsed argparse.Namespace. Exits via ap.error() (status 2)
    when the mode/flag combination is inconsistent:
      - mode 'scrape' requires --csv and forbids --tracking
      - mode 'apis'   requires --tracking and --key and forbids --csv
    """
    ap = argparse.ArgumentParser()
    # Positional mode kept as nargs=1 so callers can keep using args.mode[0];
    # choices= lets argparse reject unknown modes itself.
    ap.add_argument("mode", nargs=1, choices=["scrape", "apis"])
    # Argument for the tracking-id scrape.
    ap.add_argument("-c", "--csv", required=False, help="Input file sites.csv's path.")
    # Arguments for the API scan.
    ap.add_argument("-t", "--tracking", required=False, help="Input file sites-tracking-ids.csv's path.")
    ap.add_argument("-k", "--key", required=False, help="API key.")
    args = ap.parse_args()

    mode = args.mode[0]
    scrape_ok = mode == 'scrape' and args.csv is not None and args.tracking is None
    apis_ok = (mode == 'apis' and args.tracking is not None
               and args.key is not None and args.csv is None)
    if not (scrape_ok or apis_ok):
        # ap.error() prints the message and exits with status 2; the original
        # had an unreachable exit(-1) after this call.
        ap.error("Usage: python scraper.py <scrape || apis> <csv || tracking>")
    return args
31
32
def open_sites_csv(path):
    """Read the sites CSV and return the list of domains (second column).

    Skips the header row. Unlike the original, an empty file no longer
    raises StopIteration and blank/short rows no longer raise IndexError.
    """
    sites = []
    with open(path, newline='') as csvfile:
        csvreader = csv.reader(csvfile, dialect='excel')
        # Skip the 'domain' header row; the None default guards an empty file.
        next(csvreader, None)
        for row in csvreader:
            # Data rows look like (rank, domain); ignore rows too short
            # to carry a domain.
            if len(row) > 1:
                sites.append(row[1])
    return sites
42
43
def scrape_website_for_ids(url):
    """Fetch http://<url> and scrape third-party tracking IDs from its HTML.

    Returns a dict with keys 'quant' (Quantcast), 'ganal' (Google
    Analytics) and 'pixel' (Facebook pixel); a value is '' when the
    corresponding tag was not found or could not be parsed. Network and
    parse failures are swallowed (best-effort scrape) and yield ''.

    Output format is kept identical to the original: the pixel value
    retains its 'id=' prefix, and when several matching elements exist
    the last one wins.
    """
    print("Scraping: " + url)
    ids = {'quant': '', 'ganal': '', 'pixel': ''}
    try:
        # timeout keeps a dead host from hanging the whole scan.
        page = requests.get("http://" + url, headers=HEADERS, timeout=30)
        tree = html.fromstring(page.content)
    except Exception:
        # Unreachable/unparseable site: report no IDs instead of crashing.
        return ids

    # (xpath, result key, marker that starts the ID, end marker or None).
    extractors = (
        ("//img[contains(@src, 'pixel.quantserve')]", 'quant', 'p-', None),
        ("//script[contains(@src, 'googletag')]", 'ganal', 'UA-', None),
        ("//img[contains(@src, 'facebook.com/tr?id=')]", 'pixel', 'id=', '&ev='),
    )
    for xpath, key, start, end in extractors:
        for item in tree.xpath(xpath):
            src = item.attrib.get('src', '')
            try:
                begin = src.index(start)
                ids[key] = src[begin:src.index(end)] if end else src[begin:]
            except ValueError:
                # Marker missing from this src: skip this element instead of
                # aborting the whole extraction (the original's bare except
                # silently dropped every remaining ID on the first mismatch).
                continue
    return ids
77
78
def scrape_ids(sites_path):
    """Scrape every site listed in the CSV and write the collected IDs out."""
    results = {site: scrape_website_for_ids(site)
               for site in open_sites_csv(sites_path)}
    output_tracking_ids(results)
86
87
def output_tracking_ids(site_ids):
    """Write the scraped tracking IDs to sites-tracking-ids.csv.

    One row per site: site, google-analytics, fb-pixel, quantserve.
    """
    with open('sites-tracking-ids.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        writer.writerow(["site", "google-analytics", "fb-pixel", "quantserve"])
        for site, ids in site_ids.items():
            writer.writerow([site, ids['ganal'], ids['pixel'], ids['quant']])
95
96
def open_sites_tracking_ids_csv(path):
    """Read sites-tracking-ids.csv back into {site: {'quant','pixel','ganal'}}.

    Skips the header row. Unlike the original, an empty file no longer
    raises StopIteration and short rows no longer raise IndexError.
    """
    sites_ids = dict()
    with open(path, newline='') as csvfile:
        csvreader = csv.reader(csvfile, dialect='excel')
        # Skip the header; the None default guards an empty file.
        next(csvreader, None)
        for row in csvreader:
            # Columns: site, google-analytics, fb-pixel, quantserve.
            if len(row) >= 4:
                sites_ids[row[0]] = {'quant': row[3], 'pixel': row[2], 'ganal': row[1]}
    return sites_ids
107
108
def scan_apis(sites_tracking_csv_path, api_key):
    """Query the SpyOnWeb API for every unique Google Analytics account ID.

    Reads the scraped tracking-id CSV, strips the per-property suffix from
    each GA tag (UA-XXXX-N -> UA-XXXX), de-duplicates the account IDs,
    queries the API once per account and writes the JSON responses to
    api-scan-output.txt.
    """
    sites_ids = open_sites_tracking_ids_csv(sites_tracking_csv_path)

    # Build the unique set of GA account IDs, property suffix removed.
    analytics_ids = set()
    for site in sites_ids:
        tag = sites_ids[site].get('ganal')
        if tag:  # skip missing/empty tags
            # 'UA-123456-2' -> 'UA-123456': the dash after position 4
            # separates the account ID from the property number.
            property_index = tag.find("-", 4)
            if property_index > -1:
                tag = tag[:property_index]
            analytics_ids.add(tag)

    # Local renamed from `json` to avoid shadowing the json module.
    responses = []
    for gid in analytics_ids:
        print("Querying for GID: " + gid)
        # BUG FIX: the original passed the stale loop variable `tag` here,
        # so every API request queried the same (last-parsed) ID.
        responses.append(apis.spy_on_web(gid, api_key))
    output_analytics_json(responses)
130
def output_analytics_json(json_list):
    """Dump each API response as pretty-printed JSON to api-scan-output.txt.

    Responses are written back-to-back with no separator, exactly as before.
    """
    serialized = (json.dumps(entry, indent=4, sort_keys=True) for entry in json_list)
    with open("api-scan-output.txt", 'w') as txtfile:
        txtfile.write("".join(serialized))
135
def main():
    """Entry point: dispatch to the scraper or the API scanner by mode."""
    args = parse_arguments()
    actions = {
        'scrape': lambda: scrape_ids(args.csv),
        'apis': lambda: scan_apis(args.tracking, args.key),
    }
    action = actions.get(args.mode[0])
    if action is not None:
        action()
143
144
# Run the CLI only when executed as a script, not when imported.
if __name__ == '__main__':
    main()