· 5 years ago · Feb 20, 2020, 06:22 PM
1package thepioneer;
2
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
14
15public class Scraper {
16
17 // true = uploads to database
18 // false = Creates files in directory
19 static boolean upload_mysql = true;
20 static int startYear = 2011;
21 static int endYear = 2020;
22 static int startMonth = 1;
23 static int endMonth = 12;
24 // DO NOT CHANGE
25 static Connection conn = null;
26 static Statement stmt = null;
27 static int lastPage;
28
29 // Information for MYSQL Server
30 // DO NOT CHANGE
31 static final String JDBC_DRIVER = "com.mysql.jdbc.Driver";
32 // CHANGE ME
33 static final String ip = "localhost";
34 static final String port = "3306";
35 static final String database_name = "thepioneer";
36 static final String username = "root";
37 static final String password = "";
38
39 public static void main(String[] args) throws SQLException {
40
41 try {
42 scrape();
43 } catch (IOException e) {
44 e.printStackTrace();
45 }
46
47 }
48
49 public static void scrape() throws IOException, SQLException {
50 ArrayList<String> al = new ArrayList<String>();
51
52 int counter = 1;
53
54 // int startYear = sY;
55 // int endYear = eY;
56
57 if (upload_mysql) {
58 createDatabase();
59 createTables();
60 }
61
62 for (int i = startYear; i <= endYear; i++) {
63 for (int j = startMonth; j <= endMonth; j++) {
64 Document doc = Jsoup
65 .connect("https://www.dailypioneer.com/searchlist.php?yr=" + i + "&mn=" + j + "&page=").get();
66 try {
67 Element last = doc.select("div.pagingList").select("ul").select("li").last();
68 Elements lastPg = last.select("a");
69
70 // System.out.println(last);
71
72 String lastInt = lastPg.attr("id");
73
74 setLastPage(Integer.parseInt(lastInt));
75 } catch (Exception e) {
76
77 }
78
79
80 for (int k = 1; k < lastPage; k++) {
81
82 System.out.println("Year: " + i + " Month: " + j + " Pages: " + k);
83
84 // System.out.println(lastPageNum);
85
86 // System.out.println("Year: " + i + " Month: " + j);
87 // System.out.println("https://www.dailypioneer.com/searchlist.php?yr=" + i +
88 // "&mn=" + j + "&page=");
89
90 doc = Jsoup
91 .connect("https://www.dailypioneer.com/searchlist.php?yr=" + i + "&mn=" + j + "&page=" + k)
92 .get();
93
94 Elements highLightedNews = doc.select("div.highLightedNews").select("ul.list-unstyled").select("li")
95 .select("a");
96 Elements innerNewsList = doc.select("div.innerNewsList").select("a");
97
98 for (Element e : highLightedNews) {
99 if (!e.attr("abs:href").contains("author"))
100 al.add(e.attr("abs:href"));
101 }
102
103 for (Element e : innerNewsList) {
104 if (!e.attr("abs:href").contains("author"))
105 al.add(e.attr("abs:href"));
106 }
107
108 System.out.println(al.size());
109
110 for (String s : al) {
111 System.out.println(s);
112 doc = Jsoup.connect(s).get();
113 //String body = doc.select("div.newsDetailedContent").text();
114 //System.out.println(body);
115
116 if (upload_mysql == true) {
117
118 // Uploads articles to database
119 Connection myConn = DriverManager
120 .getConnection("jdbc:mysql://localhost:3306/" + database_name, username, password);
121
122 Statement myStmt = myConn.createStatement();
123
124 String newTitle = doc.title().replace("'", "''");
125
126 String newURL = s.replace("'", "''");
127
128 String site = "dailypioneer.com";
129
130 String newsInfo = doc.select("div.newsInfo").first().text();
131
132 String author = newsInfo.substring(newsInfo.indexOf('|') + 2, newsInfo.length());
133 String newAuthor = author.replace("'", "''");
134
135 String date = newsInfo.substring(0, newsInfo.indexOf('|'));
136 String newDate = date.replace("'", "''");
137
138 // String country = "India";
139
140 // String newCity = city.replace("'", "''");
141
142 String body = doc.select("div.newsDetailedContent").text();
143 //System.out.println(body);
144 String newContent = body.replace("'", "''");
145
146 String sql = "insert into `" + i + "` "
147 + " (id, Title, URL, Site, Author, Date, Content)" + " values ('"
148 + counter + "', '" + newTitle + "', '" + newURL + "', '" + site + "', '" + newAuthor
149 + "', '" + newDate + "', '" + newContent + "')";
150
151 try {
152 myStmt.executeUpdate(sql);
153 } catch (SQLException sqlEx) {
154 System.out.println("Error uploading article: " + counter);
155 } finally{
156 /*This block should be added to your code
157 * You need to release the resources like connections
158 */
159 if(conn!=null)
160 conn.close();
161 }
162
163 System.out.println("Article " + counter + " Succesfully Uploaded!");
164 counter++;
165 }
166 }
167
168 al.clear();
169
170 }
171 }
172 }
173 }
174
175 public static void setLastPage(int x) {
176 lastPage = x;
177 }
178
179 /**
180 * Function: Creates database
181 **/
182 public static void createDatabase() {
183 String DB_URL = "jdbc:mysql://" + ip + ":" + port + "/";
184
185 try {
186 // STEP 2: Register JDBC driver
187 Class.forName("com.mysql.jdbc.Driver");
188
189 // STEP 3: Open a connection
190 System.out.println("Connecting to database...");
191 conn = DriverManager.getConnection(DB_URL, username, password);
192
193 // STEP 4: Execute a query
194 System.out.println("Creating database...");
195 stmt = conn.createStatement();
196
197 String sql = "CREATE DATABASE IF NOT EXISTS " + database_name.toUpperCase();
198 stmt.executeUpdate(sql);
199 System.out.println("Database created successfully...");
200 } catch (SQLException se) {
201 // Handle errors for JDBC
202 se.printStackTrace();
203 } catch (Exception ex) {
204 // Handle errors for Class.forName
205 ex.printStackTrace();
206 }
207 }
208
209 /**
210 * Function: Creates tables in database
211 **/
212 /**
213 * Function: Creates tables in database
214 **/
215 public static void createTables() {
216
217 String DB_URL = "jdbc:mysql://" + ip + ":" + port + "/" + database_name;
218
219 try {
220 // STEP 2: Register JDBC driver
221 Class.forName("com.mysql.jdbc.Driver");
222
223 // STEP 3: Open a connection
224 System.out.println("Connecting to database...");
225 conn = DriverManager.getConnection(DB_URL, username, password);
226
227 for (int i = startYear; i <= endYear; i++) {
228
229 // STEP 4: Execute a query
230 System.out.println("Creating table in given database...");
231 stmt = conn.createStatement();
232
233 String sql = "CREATE TABLE IF NOT EXISTS `" + Integer.toString(i).toUpperCase() + "` "
234 + "(`id` int(11) DEFAULT NULL," + "`Title` longtext DEFAULT NULL,"
235 + "`URL` longtext DEFAULT NULL," + "`Site` text DEFAULT NULL," + "`Author` text DEFAULT NULL,"
236 + "`Date` text DEFAULT NULL," + "`Content` longtext DEFAULT NULL)";
237
238 stmt.executeUpdate(sql);
239 System.out.println("Created table in given database...");
240
241 }
242 } catch (SQLException se) {
243 // Handle errors for JDBC
244 se.printStackTrace();
245 } catch (Exception ex) {
246 // Handle errors for Class.forName
247 ex.printStackTrace();
248 }
249
250 }
251
252}