# Paste header: 7 years ago · Feb 15, 2019, 07:56 PM
import os
import sqlite3
import string
from time import sleep

import ujson
from bs4 import BeautifulSoup
from halo import Halo
from splinter import Browser
#from tinydb import TinyDB, Query
9
# Open (or create) the SQLite file that receives the scraped records.
conn = sqlite3.connect('data.db')

# Schema for the single `objects` table: six mandatory text columns.
# IF NOT EXISTS makes this safe to run against an already-initialised file.
create_table = """ CREATE TABLE IF NOT EXISTS objects (
 name text NOT NULL,
 business text NOT NULL,
 email text NOT NULL,
 link text NOT NULL,
 location text NOT NULL,
 socials text NOT NULL
 ); """

# Ensure the table is present before any insert is attempted.
conn.execute(create_table)
22
# Login credentials for the angelmatch.io sign-in form.  They are read from
# the ANGELMATCH_USER / ANGELMATCH_PASSWORD environment variables so real
# secrets never have to be written into this file; the original placeholder
# literals remain the fallback when the variables are unset.
user = os.environ.get('ANGELMATCH_USER', 'XXXX')
passwd = os.environ.get('ANGELMATCH_PASSWORD', 'XXXX')

# Number of results requested per search page (sent as the hPP URL parameter).
per_page = 1000

# Legacy TinyDB backend, disabled; do_old_db() still expects these two names.
#db = TinyDB('db.json')
#query = Query()

# Console progress indicator, started immediately and updated by the scraper.
spinner = Halo(text='Booting up...', spinner='dots')
spinner.start()
33
34
def lovely_soup(content):
    """Parse *content* (HTML markup) with the lxml parser and return the soup."""
    parser = 'lxml'
    return BeautifulSoup(content, parser)
37
38
def do_old_db(name, business, email, link, location, socials):
    """Insert a record into the legacy ``db`` store unless the name is known.

    Relies on module-level ``db`` and ``query`` objects, which this file does
    not currently define (their assignments are commented out), so calling
    this as-is raises ``NameError``.

    Returns:
        True when a new record was inserted, False when a record with the
        same ``name`` was already present.
    """
    already_stored = db.search(query.name == name)
    if already_stored:
        return False

    record = {
        'name': name,
        'business': business,
        'email': email,
        'link': link,
        'location': location,
        'socials': socials,
    }
    db.insert(record)
    return True
46
47
def do_db(name, business, email, link, location, socials):
    """Insert one record into the ``objects`` table via the module-level ``conn``.

    ``location`` and ``socials`` are stored as their ``str()`` form.  The
    insert is not committed here; the caller decides when to commit.

    Returns:
        Always True.
    """
    statement = "INSERT INTO objects (name, business, email, link, location, socials) VALUES (?, ?, ?, ?, ?, ?)"
    values = (name, business, email, link, str(location), str(socials))
    conn.execute(statement, values)
    return True
52
53
def get_letters():
    """Return every one- and two-letter lowercase ASCII string.

    The 26 single letters come first ('a'..'z'), followed by the 676
    two-letter combinations in lexicographic order ('aa', 'ab', ..., 'zz').
    """
    singles = list(string.ascii_lowercase)
    pairs = []
    for first in singles:
        for second in singles:
            pairs.append(first + second)
    return singles + pairs
58
59
# --- Main scrape -----------------------------------------------------------
# Logs in to angelmatch.io, runs one dashboard search per query prefix, and
# stores every result card in the SQLite `objects` table.

# First search prefix to run; presumably a resume point from an earlier,
# interrupted run (the original script hard-coded the list 'dq'..'zz').
START_QUERY = 'dq'

# Maximum number of one-second polls to wait for a results page to render.
MAX_POLLS = 21


def _parse_hit(hit):
    """Extract one record from a result card.

    Returns:
        A ``(name, business, email, link, location, socials)`` tuple, in the
        argument order expected by ``do_db``.  ``link`` is 'N/A' when the
        card's link text is empty; ``socials`` is a list of href strings.

    Raises:
        AttributeError, IndexError, KeyError: when the card lacks one of the
        expected elements or attributes.
    """
    headings = hit.find('div', {'class': 'name-company'}).find_all('h1')
    name = headings[0].text
    business = headings[1].text
    email = hit.find('a', {'id': 'email'}).find('span').text.strip()

    # The location is the first <span> after the card's first icon image,
    # and the link is the first <a> after that span.
    location_span = hit.find('img', {'class': 'icon'}).find_next('span')
    location = location_span.text.strip()
    link = location_span.find_next('a').text.strip() or 'N/A'

    social_anchors = hit.find('div', {'class': 'social-icons'}).find_all('a')
    socials = [anchor['href'] for anchor in social_anchors]
    return name, business, email, link, location, socials


# Every search prefix from START_QUERY through 'zz'.
letters = get_letters()
alt = letters[letters.index(START_QUERY):]

c = 0          # total number of page polls so far (shown in the spinner)
new = 0        # number of rows inserted so far
failures = []  # (query, reason) pairs, reported once the run is over

try:
    with Browser('chrome', headless=True, incognito=True) as browser:
        login = 'https://www.angelmatch.io/users/sign_in'
        spinner.text = 'Logging in'
        browser.visit(login)
        browser.fill('user[email]', user)
        browser.fill('user[password]', passwd)
        browser.find_by_xpath('//*[@id="new_user"]/input[3]').click()

        for x in alt:
            try:
                url = f'https://www.angelmatch.io/dashboard?q={x}&hPP={per_page}&idx=investors&p=0'
                browser.visit(url)
                spinner.text = f'{new}/{x}: {c} Loading page'

                # Results are rendered client-side, so poll the live DOM
                # until the first result card shows up or we run out of polls.
                ready = False
                for _ in range(MAX_POLLS):
                    first_card = lovely_soup(browser.html).find(
                        'div', {'class': 'name-company'})
                    ready = first_card is not None
                    c += 1
                    spinner.text = f'{new}/{x}: {c}: Waiting...'
                    sleep(1)
                    if ready:
                        spinner.text = f'{new}/{x}: {c} Our soup is ready...'
                        break

                if not ready:
                    # Nothing rendered in time (possibly a prefix with no
                    # matches): skip it instead of dereferencing a missing
                    # results container.
                    continue

                # Re-read the page: more cards may have rendered during the
                # final one-second sleep.
                page = lovely_soup(browser.html)
                spinner.text = f'{new}/{x}: {c} Got lovely soup'
                hits_box = page.find('div', {'id': 'hits'})
                hits = [] if hits_box is None else hits_box.find_all(
                    'div', {'class': 'hit'})

                for hit in hits:
                    # Isolate each card so one malformed card does not
                    # discard the rest of the page.
                    try:
                        record = _parse_hit(hit)
                    except (AttributeError, IndexError, KeyError) as e:
                        failures.append((x, f'unparseable hit: {e!r}'))
                        continue

                    spinner.text = f'{new}/{x}: {c} {record[0]}'
                    do_db(*record)
                    new += 1
            except Exception as e:
                # Best effort: a failing query must not stop the run, but it
                # is recorded rather than silently dropped.
                failures.append((x, repr(e)))
            finally:
                # Persist whatever this query produced, even on failure.
                conn.commit()
except Exception:
    spinner.fail('Aborted')
    raise
else:
    spinner.succeed('All done')
finally:
    conn.close()
    for failed_query, reason in failures:
        print(f'{failed_query}: {reason}')