· 4 years ago · Apr 28, 2021, 07:08 PM
1const fs = require("fs");
2const puppeteer = require("puppeteer"); //puppeteer, a headless chromium browser
3const os = require("os");
4
/*
Note: Puppeteer, being a headless browser, is pretty slow due to the usual overheads of a browser,
 and I did not realise that until too late into the assignment. I will probably switch to Scrapy
 and BeautifulSoup with Python next time.
*/
10
/** Path of the input CSV: one ASIN per line, preceded by a single header row. */
const INPUT_CSV_PATH = "./asins.csv";

/** Path of the CSV file the scraped rows are appended to. */
const OUTPUT_CSV_PATH = "./output.csv";

/** Header row of the output CSV; the column order must match formatCsvRow(). */
const OUTPUT_CSV_HEADER =
  "ASIN,UPC,Name,Price,Category,Sub-Category,Images,Description,Rating,Number of Ratings,Best Seller Rank,Brand,Manufacturer,Weight,Dimensions\n";

/** Placeholder written for any optional field that could not be scraped. */
const MISSING_VALUE = "Not Given";

/**
 * Reads the list of ASINs from a CSV file.
 *
 * The first row is treated as a header and dropped. Both "\n" and "\r\n" line
 * endings are accepted regardless of the platform the script runs on, and
 * blank lines (e.g. a trailing newline at the end of the file) are ignored so
 * they are never sent to the site as an empty search query.
 *
 * @param {string} csvPath - Path of the CSV file to read.
 * @returns {string[]} The trimmed, non-empty ASINs in file order.
 */
function readAsins(csvPath) {
  return fs
    .readFileSync(csvPath, { encoding: "utf-8" })
    .split(/\r?\n/)
    .slice(1) // drop the header row
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
}

/**
 * Serialises one value as a CSV field.
 *
 * The field is wrapped in double quotes when it contains a comma, a quote or a
 * line break (or when alwaysQuote is set), and embedded double quotes are
 * doubled so that a quote inside the text cannot terminate the field early.
 *
 * @param {*} value - Value to serialise; null/undefined become an empty field.
 * @param {boolean} [alwaysQuote=false] - Quote the field even when not required.
 * @returns {string} The CSV-safe field text.
 */
function toCsvField(value, alwaysQuote = false) {
  const text = String(value ?? "");
  if (!alwaysQuote && !/[",\r\n]/.test(text)) {
    return text;
  }
  return `"${text.replace(/"/g, '""')}"`;
}

/**
 * Looks up the product link for an ASIN on the currently loaded search page.
 *
 * @param {object} page - Puppeteer page that has already navigated to the search results.
 * @param {string} asin - ASIN whose result tile should be located.
 * @returns {Promise<string|null>} The href of the first link inside the element whose
 *   "data-asin" attribute equals the ASIN, or null when no such element exists.
 */
function findProductUrl(page, asin) {
  // The callback is serialised and executed inside the browser page, so it
  // may only use its own argument and the page's DOM.
  return page.evaluate((targetAsin) => {
    const link = document.querySelector(
      `div[data-asin='${targetAsin}'] a[href]`
    );
    return link ? link.href : null;
  }, asin);
}

/**
 * Extracts the product fields from the currently loaded product page.
 *
 * NOTE: this function is passed to page.evaluate(), which serialises it and
 * runs it inside the browser page. It must therefore stay self-contained: it
 * cannot reference any variable or helper declared outside its own body.
 *
 * @returns {object} The raw scraped fields. Fields that were not found are
 *   either left undefined or set to the placeholder strings used below.
 */
function extractProductData() {
  /** innerText of the first element matching the selector, or null when absent. */
  const textOf = (selector) => {
    const node = document.querySelector(selector);
    return node ? node.innerText : null;
  };

  /** Replaces commas and newlines with spaces so the text stays on one CSV line. */
  const flatten = (text) => text.split(",").join(" ").split("\n").join(" ");

  /** Removes every parenthesised remark, e.g. "(see top 100 in ...)". */
  const stripParens = (text) => text.replace(/\([^()]*\)/g, "");

  // Price: prefer the regular price and fall back to the deal price. The first
  // number in the text is taken, so the currency symbol may or may not be
  // separated from the amount by whitespace.
  const priceText =
    textOf("span#priceblock_ourprice") ?? textOf("span#priceblock_dealprice");
  const priceMatch = priceText ? priceText.match(/\d[\d,]*(?:\.\d+)?/) : null;
  const price = priceMatch
    ? parseFloat(priceMatch[0].replace(/,/g, ""))
    : "Not Given";

  const nameText = textOf("span#productTitle");
  const name = nameText === null ? "Not Found" : flatten(nameText);

  const descriptionText = textOf("div#productDescription");
  const description = descriptionText === null ? "" : flatten(descriptionText);

  // Rating text looks like "3.6 out of 5"; only the leading number is kept.
  const ratingText = textOf("span[data-hook*=rating]");
  const ratings = ratingText === null ? "No ratings" : ratingText.split(" ")[0];

  // Review count looks like "1,633 global ratings"; every thousands separator
  // is removed before the numeric conversion.
  const reviewCountText = textOf("div[data-hook='total-review-count'] span");
  const numRatings =
    reviewCountText === null
      ? 0
      : Number(reviewCountText.split(" ")[0].replace(/,/g, ""));

  // The byline's first word is dropped and the remainder is used as the brand.
  const bylineText = textOf("a#bylineInfo");
  const bylineWords = bylineText === null ? [] : bylineText.split(" ");
  const brand =
    bylineWords.length > 1 ? bylineWords.slice(1).join(" ") : "Not Given";

  // Category from the navigation bar; replaced below when the best sellers
  // rank section names the categories.
  let category = document.querySelector("#nav-subnav a span")?.innerText;
  let subCategory;
  let upc;
  let manufacturer;
  let dimensions;
  let weight;
  let bestSellersRank;

  /** Category name from a rank line such as "#250 in home & kitchen". */
  const categoryOfRank = (line) => {
    const marker = line.indexOf(" in ");
    const categoryName = marker === -1 ? line : line.slice(marker + 4);
    return categoryName.replace(/,/g, "").trim();
  };

  /**
   * Stores one key/value pair from the product details section. The key is
   * matched by substring (e.g. "product dimensions" contains "dimension").
   * Pairs with a missing key or value are ignored instead of throwing.
   */
  const recordDetail = (rawKey, rawValue) => {
    if (!rawKey || !rawValue) {
      return;
    }
    const key = rawKey.toLowerCase(); // lowercase to avoid case mismatches
    const value = rawValue.toLowerCase();
    const candidateUpc = value.trim();

    if (
      (key.includes("upc") || key.includes("item part number")) &&
      /^\d{12,13}$/.test(candidateUpc)
    ) {
      upc = candidateUpc;
    } else if (key.includes("manufacturer")) {
      manufacturer = value;
    } else if (key.includes("dimension")) {
      dimensions = value;
    } else if (key.includes("weight")) {
      weight = value;
    } else if (key.includes("best sellers rank")) {
      // One ranking per line; empty lines are dropped so that no dangling
      // separator is produced when the rankings are joined.
      const rankLines = value
        .split("\n")
        .map((line) => stripParens(line).trim())
        .filter((line) => line.length > 0);
      if (rankLines.length > 0) {
        bestSellersRank = rankLines.join(" AND ");
        [category, subCategory] = rankLines.map(categoryOfRank);
      }
    }
  };

  // The details are rendered either as a bullet list or as a table; the
  // bullet list takes precedence when both are present.
  const tableRows = Array.from(
    document.querySelectorAll("table[id*=productDetails] tr")
  );
  const bulletItems = Array.from(
    document.querySelectorAll("div#detailBullets_feature_div li")
  );

  if (bulletItems.length > 0) {
    for (const item of bulletItems) {
      // Usually the key and the value sit in two nested spans.
      const [keyText, valueText] = Array.from(
        item.querySelectorAll("span > span")
      ).map((span) => span.innerText);
      // Otherwise the first span is split on ": " and the part after it is used.
      const value =
        valueText || item.querySelector("span")?.innerText.split(": ")[1];
      recordDetail(keyText, value);
    }
  } else {
    for (const row of tableRows) {
      // th holds the key (manufacturer, dimensions, ...) and td the value.
      recordDetail(
        row.querySelector("th")?.innerText,
        row.querySelector("td")?.innerText
      );
    }
  }

  // Thumbnail URLs, one per line, skipping the 360-degree-view icon. The
  // regex strips the size suffix (e.g. "._SS40_") to normalise the URL.
  const images = Array.from(document.querySelectorAll("div#altImages img"))
    .map((img) => img.src)
    .filter(
      (src) =>
        !src.includes(
          "https://images-na.ssl-images-amazon.com/images/G/31/HomeCustomProduct/360_icon"
        )
    )
    .map((src) => `${src.replace(/\._.*_/g, "")}\n`)
    .join("");

  return {
    name,
    upc,
    price,
    description,
    ratings,
    numRatings,
    brand,
    manufacturer,
    dimensions,
    weight,
    bestSellersRank,
    category,
    subCategory,
    images,
  };
}

/**
 * Normalises the raw scraped fields into the record that is logged and
 * written to the CSV file. Missing optional fields become "Not Given" rather
 * than the literal text "undefined".
 *
 * @param {object} data - Result of extractProductData().
 * @returns {object} The record, keyed by the names used in the console output.
 */
function buildRecord(data) {
  const stripCommas = (text) => text.replace(/,/g, "");

  return {
    name: data.name,
    upc: data.upc || MISSING_VALUE,
    price: data.price,
    category: data.category
      ? data.category.toLowerCase().trim()
      : MISSING_VALUE,
    sub_category: data.subCategory
      ? data.subCategory.toLowerCase().trim()
      : MISSING_VALUE,
    images: data.images,
    description: data.description,
    ratings: data.ratings,
    num_ratings: data.numRatings,
    best_sellers_rank: data.bestSellersRank
      ? stripCommas(data.bestSellersRank).trim()
      : MISSING_VALUE,
    brand: data.brand,
    manufacturer: data.manufacturer
      ? stripCommas(data.manufacturer)
      : MISSING_VALUE,
    weight: data.weight ? stripCommas(data.weight) : MISSING_VALUE,
    // Dimensions keep their separators, but as ";" instead of ",".
    dimensions: data.dimensions
      ? data.dimensions.replace(/,/g, ";")
      : MISSING_VALUE,
  };
}

/**
 * Builds one output CSV line, in the column order of OUTPUT_CSV_HEADER.
 *
 * @param {string} asin - ASIN the record belongs to.
 * @param {object} record - Result of buildRecord().
 * @returns {string} The CSV line, terminated by "\n".
 */
function formatCsvRow(asin, record) {
  const fields = [
    toCsvField(asin),
    toCsvField(record.upc),
    toCsvField(record.name),
    toCsvField(record.price),
    toCsvField(record.category),
    toCsvField(record.sub_category),
    toCsvField(record.images, true), // multi-line value, always quoted
    toCsvField(record.description, true),
    toCsvField(record.ratings),
    toCsvField(record.num_ratings),
    toCsvField(record.best_sellers_rank),
    toCsvField(record.brand),
    toCsvField(record.manufacturer),
    toCsvField(record.weight),
    toCsvField(record.dimensions),
  ];
  return `${fields.join(",")}\n`;
}

/*
Entry point: for every ASIN in the input CSV, search for it, open the first
matching product page, scrape it and append one row to the output CSV.
*/
(async () => {
  const asins = readAsins(INPUT_CSV_PATH);

  // create the output CSV file (overwrites any previous run)
  fs.writeFileSync(OUTPUT_CSV_PATH, OUTPUT_CSV_HEADER);

  const browser = await puppeteer.launch();
  try {
    // a single page is reused for every navigation
    const page = await browser.newPage();

    for (const [index, asin] of asins.entries()) {
      // search query with the ASIN
      await page.goto(`https://amazon.in/s?k=${encodeURIComponent(asin)}`, {
        waitUntil: "networkidle0",
        timeout: 0,
      });
      console.log(`entered ${asin}`);

      const url = await findProductUrl(page, asin);
      console.log({ url });

      console.log(
        "____________________________________________________________________________"
      );
      console.log(`Product Number: ${index + 1}`);
      console.log(
        "‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾"
      );

      // No URL means the search returned no tile for this ASIN: skip it.
      if (!url) {
        continue;
      }

      await page.goto(url, {
        waitUntil: "domcontentloaded",
        timeout: 0,
      });

      const record = buildRecord(await page.evaluate(extractProductData));
      console.log(record);

      // Rows are appended one at a time, so the rows already written stay
      // in the file if a later product fails.
      fs.appendFileSync(OUTPUT_CSV_PATH, formatCsvRow(asin, record));
    }
  } finally {
    // close the browser even when scraping fails part-way through
    await browser.close();
  }
})().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
329