· 6 years ago · May 19, 2019, 05:18 AM
1#Mohak Buch
2#Koshi Huynh
3
4# importing modules
5import urllib.request as ur
6import requests
7from bs4 import BeautifulSoup
8import re
9import time
10import json
11import sqlite3
12'''
13This code extracts a table from a website with different high tech jobs and information about them. This information
14is extracted from a link within each job listed. The information given is used to create a database.
15'''
16
17page = requests.get('https://www.bls.gov/ooh/computer-and-information-technology/home.htm')
18soup = BeautifulSoup(page.content, "lxml")
19master = {}
20for tag in soup.find_all('h4'):
21 for link in tag.find_all('a', href=True):
22 currentOccupation = {}
23 tableURL = requests.get('https://www.bls.gov' + link.get('href'))
24 soup2 = BeautifulSoup(tableURL.content, "lxml")
25 tag2 = soup2.find_all('table')
26
27 keys = tag2[0].find_all('th') # Keys for currentOccupation (keys[0] is occupation title)
28
29 values = tag2[0].find_all('td') # Values for currentOccupation
30
31 splitYearlySal = re.split(r'\s', values[0].text.strip())
32
33 currentOccupation[re.sub(' +', ' ', keys[1].text)] = int(re.sub('[^-^0-9]', '', splitYearlySal[
34 0])) # YearlySalary/MedianPay added to currentOccupation dictionary as int
35 # print(currentOccupation)
36 for i in range(1, 4):
37 currentOccupation[keys[i + 1].text.strip()] = values[
38 i].text.strip() # EntryEducation, WorkExp, OnTheJobTraining added to currentOccupation dictionary
39
40 for i in range(4, 7):
41 currentOccupation[keys[i + 1].text.strip()] = int(re.sub('[^-^0-9]', '', values[
42 i].text.strip())) # Ensures last three values of currentOccupation are ints
43
44 if values[
45 1].text.strip() == 'See How to Become One': # Fixes EntryLvlEducation -> Certification if SeeHowToBecomeOne
46 currentOccupation[keys[2].text.strip()] = "Certification"
47 else:
48 currentOccupation[keys[2].text.strip()] = values[1].text.strip()
49
50 master[keys[0].text[21:]] = currentOccupation # Adds currentOccupation to master
51
52with open('data.json', 'w') as fh:
53 json.dump(master, fh, indent=3) # Dump master
54
55with open('data.json', 'r') as fr:
56 data = json.load(fr)
57temp = [header for header in list(data.values())[0].keys()]
58titles = tuple(temp)
59con = sqlite3.connect('jobsDatabase.db')
60cur = con.cursor()
61
62# fields table
63cur.execute("DROP TABLE IF EXISTS FieldsDB")
64cur.execute('''CREATE TABLE FieldsDB(
65 medPay TEXT,
66 education TEXT,
67 workExperience TEXT,
68 jobTraining TEXT,
69 numJobs TEXT,
70 jobOutlook TEXT,
71 employmentChange TEXT)''')
72cur.execute('''INSERT INTO FieldsDB
73 VALUES
74 (?, ?, ?, ?, ?, ?, ?);''', titles)
75
76# Jobs table
77cur.execute("DROP TABLE IF EXISTS JobsDB")
78cur.execute('''CREATE TABLE JobsDB(
79 occupation TEXT,
80 medPay INTEGER,
81 education TEXT,
82 workExperience TEXT,
83 jobTraining TEXT,
84 numJobs INTEGER,
85 jobOutlook INTEGER,
86 employmentChange INTEGER)''')
87
88# education Table
89cur.execute("DROP TABLE IF EXISTS EducationDB")
90cur.execute('''CREATE TABLE EducationDB(
91 id INTEGER NOT NULL PRIMARY KEY,
92 degree TEXT UNIQUE ON CONFLICT IGNORE)''')
93#education data
94for occ in data:
95 occupData = [occ]
96 for value in data[occ].values():
97 occupData.append(value)
98
99 cur.execute("INSERT INTO EducationDB (degree) VALUES (?)", (list(data[occ].values())[1],))
100 cur.execute("SELECT id FROM EducationDB WHERE degree = ?", (occupData[2],))
101 eduID = cur.fetchone()[0]
102 occupData[2] = eduID
103
104 cur.execute('''INSERT INTO JobsDB
105 VALUES
106 (?, ?, ?, ?, ?, ?, ?, ?);''', list(occupData))
107
108con.commit()
109con.close()