# Paste metadata: 7 years ago · Feb 15, 2019, 07:58 PM
1#from tinydb import TinyDB, Query
2from bs4 import BeautifulSoup
3from splinter import Browser
4from time import sleep
5import string
6import ujson
7from halo import Halo
8import sqlite3
9
# Open (or create) the SQLite file that receives the scraped records.
conn = sqlite3.connect('data.db')

# Schema of the only table: six mandatory text columns.
create_table = """ CREATE TABLE IF NOT EXISTS objects (
 name text NOT NULL,
 business text NOT NULL,
 email text NOT NULL,
 link text NOT NULL,
 location text NOT NULL,
 socials text NOT NULL
 ); """

# IF NOT EXISTS makes this a no-op when the table is already there.
conn.execute(create_table)
22
# Values typed into the sign-in form; replace the placeholders before running.
user, passwd = 'XXXX', 'XXXX'

# Sent as the hPP query parameter of every dashboard search URL.
per_page = 1000

# Terminal spinner used for all progress messages; started immediately.
spinner = Halo(text='Booting up...', spinner='dots')
spinner.start()
30
31
def lovely_soup(content):
    """Parse *content* with BeautifulSoup using the lxml parser and return the tree."""
    parser = 'lxml'
    return BeautifulSoup(content, parser)
34
35
def do_old_db(name, business, email, link, location, socials):
    """Store a record through ``db`` unless one with the same name exists.

    NOTE(review): this uses module-level ``db`` and ``query`` objects. The
    only tinydb import visible in this file is commented out, so confirm
    both are defined before calling this function.

    Returns:
        True when the record was inserted, False when ``db.search`` already
        found an entry with this ``name``.
    """
    if db.search(query.name == name):
        return False

    record = {
        'name': name,
        'business': business,
        'email': email,
        'link': link,
        'location': location,
        'socials': socials,
    }
    db.insert(record)
    return True
43
44
def do_db(name, business, email, link, location, socials):
    """Insert one row into the ``objects`` table through the module-level ``conn``.

    ``location`` and ``socials`` are converted with ``str()`` before being
    bound. The statement is executed but not committed here.

    Returns:
        Always True.
    """
    statement = "INSERT INTO objects (name, business, email, link, location, socials) VALUES (?, ?, ?, ?, ?, ?)"
    values = (name, business, email, link, str(location), str(socials))
    conn.execute(statement, values)
    return True
49
50
def get_letters():
    """Return the search terms: 'a'..'z' followed by 'aa', 'ab', ..., 'zz'.

    The single letters come first in alphabetical order, then every
    two-letter combination ordered by first letter and then second letter.
    """
    singles = list(string.ascii_lowercase)
    pairs = []
    for first in singles:
        for second in singles:
            pairs.append(first + second)
    return singles + pairs
55
56
def _wait_for_results(browser, label, max_polls=21, delay=1):
    """Poll the rendered page until a result card appears or polling runs out.

    Args:
        browser: Splinter browser already pointed at a search page.
        label: Prefix used for the spinner text while waiting.
        max_polls: How many times the page is inspected before giving up.
        delay: Seconds slept after each inspection.

    Returns:
        True once a ``div.name-company`` element is found, False if none
        showed up within ``max_polls`` inspections.
    """
    for _ in range(max_polls):
        spinner.text = f'{label}: Waiting...'
        ready = lovely_soup(browser.html).find('div', {'class': 'name-company'})
        # Sleep before deciding, so the caller re-reads the HTML a moment
        # after the first card was seen.
        sleep(delay)
        if ready is not None:
            return True
    return False


def _parse_hit(hit):
    """Extract one record from a ``div.hit`` element.

    Returns:
        Tuple ``(name, business, email, link, location, socials)`` in the
        argument order of ``do_db``; ``socials`` is a list of href strings
        and an empty ``link`` is replaced by ``'N/A'``.

    Raises:
        AttributeError, IndexError, KeyError: when an expected element or
            attribute is missing from the markup.
    """
    headings = hit.find('div', {'class': 'name-company'}).findAll('h1')
    name = headings[0].text
    business = headings[1].text
    email = hit.find('a', {'id': 'email'}).find('span').text.strip()

    # The span after the first icon holds the location; the link text is
    # taken from the first <a> that follows that span.
    location_span = hit.find('img', {'class': 'icon'}).findNext('span')
    location = location_span.text.strip()
    link = location_span.findNext('a').text.strip() or 'N/A'

    anchors = hit.find('div', {'class': 'social-icons'}).findAll('a')
    socials = [anchor['href'] for anchor in anchors]
    return name, business, email, link, location, socials


with Browser('chrome', headless=True, incognito=True) as browser:
    login = 'https://www.angelmatch.io/users/sign_in'
    spinner.text = 'Logging in'
    browser.visit(login)
    browser.fill('user[email]', user)
    browser.fill('user[password]', passwd)
    # Third <input> of the sign-in form; presumably the submit button.
    browser.find_by_xpath('//*[@id="new_user"]/input[3]').click()

    new = 0          # rows inserted so far, shown in the spinner text
    failures = []    # (query, exception) pairs reported after the run

    try:
        for x in get_letters():
            try:
                url = f'https://www.angelmatch.io/dashboard?q={x}&hPP={per_page}&idx=investors&p=0'
                browser.visit(url)
                spinner.text = f'{new}/{x}: Loading page'

                if _wait_for_results(browser, f'{new}/{x}'):
                    spinner.text = f'{new}/{x}: Our soup is ready...'

                # Re-read the page after waiting and collect every result card.
                soup = lovely_soup(browser.html)
                spinner.text = f'{new}/{x}: Got lovely soup'
                container = soup.find('div', {'id': 'hits'})
                # The container can be missing (e.g. after a timeout); treat
                # that as "no hits" instead of raising on None.
                hits = container.findAll('div', {'class': 'hit'}) if container is not None else []

                for hit in hits:
                    try:
                        record = _parse_hit(hit)
                    except (AttributeError, IndexError, KeyError) as e:
                        # One malformed card must not drop the rest of the page.
                        failures.append((x, e))
                        continue

                    spinner.text = f'{new}/{x}: {record[0]}'
                    do_db(*record)
                    new += 1
            except Exception as e:
                # Best-effort scrape: remember the problem and move on to the
                # next query instead of hiding it.
                failures.append((x, e))
            # Commit per query so rows already inserted survive a later crash.
            conn.commit()
    finally:
        conn.close()

    if failures:
        spinner.warn(f'Finished with {len(failures)} problem(s)')
        for failed_query, error in failures:
            print(f'  {failed_query}: {error!r}')
    else:
        spinner.succeed('All done')