· 5 years ago · Mar 24, 2020, 04:46 PM
1query = """
2 DROP TABLE IF EXISTS {0}.sbb_geneve_departure_delay
3""".format(username)
4cur.execute(query)
5query = """
6 CREATE EXTERNAL TABLE {0}.sbb_geneve_departure_delay STORED AS ORC AS
7 SELECT betriebstag, fahrt_bezeichner, linien_id, linien_text,
8 (unix_timestamp(ab_prognose, '{1}') - unix_timestamp(abfahrtszeit, '{2}'))/60 AS departure_delay
9 FROM {0}.sbb_orc
10 WHERE lower(haltestellen_name) = 'genève' AND produkt_id = 'Zug' AND ab_prognose_status = 'REAL'
11 AND unix_timestamp(ab_prognose, 'dd.MM.yyyy HH:mm:ss') - unix_timestamp(abfahrtszeit, 'dd.MM.yyyy HH:mm') > 120
12""".format(username, actual_time_format, expected_time_format)
13cur.execute(query)
14
15query = """
16 DROP TABLE IF EXISTS {0}.sbb_lausanne_arrival_delay
17""".format(username)
18cur.execute(query)
19query = """
20 CREATE EXTERNAL TABLE {0}.sbb_lausanne_arrival_delay STORED AS ORC AS
21 SELECT betriebstag, fahrt_bezeichner, linien_id, linien_text,
22 (unix_timestamp(an_prognose, '{1}') - unix_timestamp(ankunftszeit, '{2}'))/60 AS arrival_delay
23 FROM {0}.sbb_orc
24 WHERE lower(haltestellen_name) = 'lausanne' AND produkt_id = 'Zug' AND an_prognose_status = 'REAL'
25""".format(username, actual_time_format, expected_time_format)
26cur.execute(query)
27
28query = """
29 DROP TABLE IF EXISTS {0}.sbb_lausanne_geneve_delay
30""".format(username)
31cur.execute(query)
32query = """
33 CREATE EXTERNAL TABLE {0}.sbb_lausanne_geneve_delay STORED AS ORC AS
34 SELECT L.betriebstag, L.fahrt_bezeichner, G.departure_delay, L.arrival_delay, G.linien_id, G.linien_text
35 FROM {0}.sbb_geneve_departure_delay G
36 JOIN {0}.sbb_lausanne_arrival_delay L
37 ON L.betriebstag = G.betriebstag AND L.fahrt_bezeichner = G.fahrt_bezeichner
38""".format(username)
39cur.execute(query)
40
41query = """
42 SELECT linien_id, linien_text, AVG(departure_delay) AS avg_dep_delay, COUNT(*) AS cnt
43 FROM {0}.sbb_lausanne_geneve_delay
44 GROUP BY linien_id, linien_text
45 HAVING cnt > 50
46 ORDER BY avg_dep_delay DESC
47 LIMIT 5
48""".format(username)
49pd.read_sql(query, conn)
50
51query = """
52 SELECT * FROM {0}.sbb_lausanne_geneve_delay
53 WHERE linien_id = '2510' OR linien_id = '1734'
54""".format(username)
55delay_data = pd.read_sql(query, conn)
56
57import seaborn as sns
58delay_data_ = delay_data[['sbb_lausanne_geneve_delay.departure_delay', 'sbb_lausanne_geneve_delay.arrival_delay', 'sbb_lausanne_geneve_delay.linien_id']]
59fig, (ax1, ax2) = plt.subplots(1, 2, sharey=False, figsize=(18, 6))
60def plot_lines(names, ax, colors, limits, nbins):
61 for i in range(len(names)):
62 sns.distplot(delay_data_[delay_data_['sbb_lausanne_geneve_delay.linien_id'] == names[i]]['sbb_lausanne_geneve_delay.departure_delay'], color=colors[0],
63 label="Departure", kde=False, hist_kws={'alpha':.7}, ax=ax[i], bins=nbins)
64 sns.distplot(delay_data_[delay_data_['sbb_lausanne_geneve_delay.linien_id'] == names[i]]['sbb_lausanne_geneve_delay.arrival_delay'], color=colors[1],
65 label="Ariival", kde=False, hist_kws={'alpha':.7}, ax=ax[i], bins=nbins)
66 ax[i].set_title("Departure and arrival delays for line " + str(names[i]))
67 ax[i].set_xlim(*limits)
68 ax[i].grid(True)
69 ax[i].set_xlabel("Delay(min)")
70 ax[i].set_ylabel("Number")
71 ax[i].legend()
72
73colors = ["yellow", "dodgerblue"]
74plot_lines(delay_data_["sbb_lausanne_geneve_delay.linien_id"].unique(), [ax1, ax2], colors, (-5, 30), 30)
75
76We can see that IR trains (line 1734) can catch up and often have their arrival delays even below zero (arrive earlier) while IR15 trains (line 2510) usually can not.