· 5 years ago · Mar 29, 2020, 01:56 PM
1#!/usr/bin/env python
2# coding: utf-8
3
4import requests
5import time
6from bs4 import BeautifulSoup
7import re
8from datetime import datetime
9import json
10
11import os
12import sqlalchemy
13import psycopg2
14import numpy as np
15
16from flask import Flask
17RicercaImmobili = Flask(__name__)
18
19@RicercaImmobili.route("/")
20
21def main():
22 tipo_sql = {str: 'text',
23 dict: 'json',
24 list: 'text[]',
25 int: 'bigint',
26 float: 'real',
27 bool: 'boolean',
28 datetime: 'timestamp without time zone'
29 }
30
31 def flatten_dict(dd, separator='_', prefix=''):
32 return { prefix + separator + k if prefix else k : v
33 for kk,vv in dd.items()
34 for k, v in flatten_dict(vv, separator, kk).items()
35 } if isinstance(dd, dict) else { prefix : dd }
36
37 def up_to_underscore(string):
38 return re.sub(r'[A-Z]', lambda m: '_' + m.group(0).lower(), string)
39
40 def estrai_dati(page_url,colonne_utili):
41 # Accetta un url del tipo 'https://www.immobiliare.it/annunci/79617755/'
42 # Restituisce una riga di database (list), o None se l'indirizzo non
43 # è stato trovato
44 dati = {}
45 response = requests.get(page_url)
46 if response.status_code != 200:
47 dati = {}
48 print(str(datetime.now()) + ' -- Non ho trovato l\'annuncio {}.\nForse nel frattempo e` stato rimosso.'.format(page_url))
49 else:
50 page_soup = BeautifulSoup(response.text, "html.parser")
51 raw_list = page_soup.find(id="js-hydration")
52 try:
53 json_dict = json.loads(raw_list.text)
54 except AttributeError:
55 json_dict = {}
56 try:
57 listing = json_dict['listing']
58 if listing is None:
59 listing = {}
60 except (KeyError, ValueError, IndexError):
61 listing = {}
62 temp = flatten_dict(listing)
63 listing = {up_to_underscore(k):temp[k] for k in temp}
64 try:
65 properties = listing['properties'][0]
66 if properties is None:
67 properties = {}
68 except (KeyError, ValueError, IndexError):
69 properties = {}
70 temp = flatten_dict(properties)
71 properties = {up_to_underscore(k):temp[k] for k in temp}
72 values = {**listing, **properties}
73 dati = {k : values[k] if k in values.keys() else None for k in colonne_utili }
74 try:
75 dati['created_at'] = datetime.fromtimestamp(dati['created_at'])
76 dati['updated_at'] = datetime.fromtimestamp(dati['updated_at'])
77 except TypeError:
78 pass
79 for k in dati:
80 if isinstance(dati[k], str):
81 try:
82 dati[k] = re.sub(r'\s',' ', re.sub(r'[^\w\s]', ' ', dati[k]).lower())
83 except TypeError:
84 pass
85 return dati
86
87 colonne_utili = {'uuid':str, 'id':int, 'type':str, 'created_at':datetime,
88 'updated_at':datetime, 'title':str, 'caption':str,
89 'guaranteed':str, 'reference_code':str, 'contract_type':str, 'contract_name':str,
90 'advertiser_type':str, 'advertiser_agency_id':int, 'advertiser_agency_type':str,
91 'advertiser_agency_display_name':str, 'advertiser_agency_typology_id':int,
92 'advertiser_agency_typology_name':str, 'advertiser_agency_sub_typology_id':int,
93 'advertiser_agency_sub_typology_name':str,
94 # dati da 'properties':
95 'main': bool, 'costs_price': int, 'costs_currency': str,
96 'costs_condominium': int, 'costs_expenses': int, 'costs_heating': int,
97 'costs_visible': bool, 'costs_lowered_price_original_price': str,
98 'costs_lowered_price_current_price': str,'location_latitude': float,
99 'location_longitude': float, 'location_nation_name': str, 'location_region_name': str,
100 'location_province_name': str, 'location_city_id': str, 'location_city_name': str,
101 'location_address': str, 'location_visible': bool, 'rooms_number': str,
102 'rooms_has_more': bool, 'num_other_rooms': int, 'num_bedrooms': int,
103 'surface': str, 'bathrooms_number': int, 'bathrooms_has_more': bool,
104 'elevator': bool, 'floor_id': int, 'floor_name': str, 'floor_type': str,
105 'floor_value': str, 'floors': int, 'garage': str, 'description': str,
106 'description_alternate': str, 'building_year': int, 'condition_id': str,
107 'condition_name': str, 'feature_values': list, 'category_id': int,
108 'category_name': str, 'typology_id': int, 'typology_name': str,
109 'typology_v2_id': int, 'typology_v2_name': str}
110
111 print('Estraggo la lista degli annunci...')
112 print('Sono a pagina ', end='')
113 p = 0
114 lista_url = []
115 while True:
116 p += 1
117 print(str(p) + ' ', end='')
118 url = 'https://www.immobiliare.it/ricerca.php?idCategoria=1&idContratto=1&idNazione=IT&criterio=dataModifica&ordine=desc&pag={}&vrt=45.545793%2C10.812263%3B45.527998%2C10.801277%3B45.502978%2C10.807457%3B45.499369%2C10.796471%3B45.501775%2C10.762482%3B45.500091%2C10.728493%3B45.540022%2C10.71785%3B45.572476%2C10.700684%3B45.568871%2C10.663605%3B45.597224%2C10.681801%3B45.611156%2C10.725403%3B45.606592%2C10.763512%3B45.605391%2C10.780678%3B45.594342%2C10.806427%3B45.584972%2C10.817413%3B45.564064%2C10.81501%3B45.555891%2C10.823936%3B45.545793%2C10.812263'.format(str(p))
119 time.sleep(1) # wait one second (just to be sure)
120 response = requests.get(url)
121 if response.status_code != 200:
122 break
123 else:
124 soup = BeautifulSoup(response.text, "html.parser")
125 href_label = re.compile('https://www\.immobiliare\.it/annunci/(.*)/')
126 lista_url += [x.get('href') for x in soup.findAll('a', {"href":href_label}) if not(x.get('href') in lista_url)]
127 lista_pk = [int(re.search(href_label, x).group(1)) for x in lista_url]
128 print()
129 print(' Fatto!')
130
131 # connect to postgres DB
132 print('Connetto al database remoto')
133 DATABASE_URL = os.environ['DATABASE_URL']
134 conn = psycopg2.connect(DATABASE_URL, sslmode='require')
135 cur = conn.cursor()
136 # check if table exists
137 table_name = 'immobili'
138 cur.execute(f'SELECT to_regclass(\'public.{table_name}\')')
139 a = cur.fetchone()[0]
140 table_exists = (a is not None)
141 print('Table exists: ' + str(table_exists))
142
143 # Create query
144 s = """, """.join((k + """ """ + tipo_sql[colonne_utili[k]] for k in colonne_utili))
145 create_table = f"""CREATE TABLE {table_name} ({s})"""
146 set_pk = f"""ALTER TABLE public.{table_name} ADD PRIMARY KEY (id)"""
147
148 if table_exists:
149 cur.execute(f'SELECT (id) FROM public.{table_name}')
150 ids = [a[0] for a in cur.fetchall()]
151 print('Ho raccolto gli id gia presenti nella tabella.')
152 else:
153 ids = []
154 cur.execute(create_table)
155 cur.execute(set_pk)
156 print('La tabella non esisteva nel database. L\'ho creata.')
157
158 conteggio_aggiunte = 0
159 for pk in lista_pk:
160 if not (pk in ids):
161 page_url = f'https://www.immobiliare.it/annunci/{pk}/'
162 dati = estrai_dati(page_url,colonne_utili)
163 # Insert query
164 formatted_values = {x : '\'' + dati[x] + '\'' if isinstance(dati[x],str) else
165 str(dati[x]) if isinstance(dati[x],int) or isinstance(dati[x],float) else
166 '\'' + dati[x].strftime('%Y-%m-%d %H:%M:%S') + '\'' if isinstance(dati[x],datetime) else
167 'ARRAY {}::text[]'.format(str(dati[x])) if isinstance(dati[x], list) else
168 'NULL' for x in dati}
169 columns = """, """.join (formatted_values.keys())
170 values = """, """.join (formatted_values.values())
171 insert_query = f"""INSERT INTO public.{table_name} ({columns}) VALUES ({values})"""
172 cur.execute(insert_query)
173 conteggio_aggiunte += 1
174 print(f'{conteggio_aggiunte}. Aggiunto al database remoto l\'annuncio {pk}')
175
176 print(f'Finito! Ho aggiunto {conteggio_aggiunte} annunci.')
177 cur.close()
178 conn.commit()
179 conn.close()
180
181if __name__ == "__main__":
182 port = int(os.environ.get("PORT", 5000))
183 print('Sono partito!')
184 RicercaImmobili.run(host='0.0.0.0', port=port)