gUsKx8Ln

· 4 years ago · Apr 28, 2021, 07:08 PM
1const fs = require("fs");
2const puppeteer = require("puppeteer"); //puppeteer, a headless chromium browser
3const os = require("os");
4
/*
Note: Puppeteer drives a full headless browser, so it is fairly slow because of the usual browser overhead.
  I did not realise that until too late into the assignment. Next time I will probably switch to
  Scrapy and BeautifulSoup with Python.
*/
10
// Placeholder written to the CSV for any value that could not be scraped.
const NOT_GIVEN = "Not Given";

// Header row of the output file; formatCsvRow() must emit fields in this order.
const CSV_HEADER =
  "ASIN,UPC,Name,Price,Category,Sub-Category,Images,Description,Rating,Number of Ratings,Best Seller Rank,Brand,Manufacturer,Weight,Dimensions\n";

/**
 * Extracts the ASINs from the raw text of the input CSV.
 *
 * Splits on both LF and CRLF regardless of the host platform (a file written
 * on Windows may be read on Linux), drops the header row, and discards blank
 * lines so that a trailing newline does not turn into an empty search.
 *
 * @param {string} csvText - Full contents of the input file.
 * @returns {string[]} Trimmed, non-empty lines after the header, in file order.
 */
function parseAsins(csvText) {
  return csvText
    .split(/\r?\n/)
    .slice(1) // first row is the title row
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
}

/**
 * Serialises one value as a CSV field.
 *
 * The field is wrapped in double quotes (with embedded quotes doubled) when it
 * contains a comma, a quote or a line break, or when `alwaysQuote` is set.
 * `null` / `undefined` are written as the "Not Given" placeholder instead of
 * the literal text "undefined".
 *
 * @param {*} value - Value to serialise.
 * @param {boolean} [alwaysQuote=false] - Quote the field even when not required.
 * @returns {string} The CSV-safe field text.
 */
function csvField(value, alwaysQuote = false) {
  const text = value == null ? NOT_GIVEN : String(value);
  if (!alwaysQuote && !/[",\r\n]/.test(text)) {
    return text;
  }
  return `"${text.replace(/"/g, '""')}"`;
}

/**
 * Runs inside the browser page (passed to page.evaluate), so it must not
 * reference anything from the Node.js scope.
 *
 * Looks up the first link inside the search-result element whose `data-asin`
 * attribute equals the given ASIN.
 *
 * @param {string} asin - ASIN that was searched for.
 * @returns {string|null} The link's href, or null when no such element exists.
 */
function findProductUrl(asin) {
  const link = document.querySelector(`div[data-asin='${asin}']  a[href]`);
  return link ? link.href : null;
}

/**
 * Runs inside the browser page (passed to page.evaluate), so it must be fully
 * self-contained: no Node.js globals, no helpers or constants defined outside.
 *
 * Scrapes the currently loaded product page and returns the raw values.
 * Fields that cannot be found are left undefined (and therefore dropped when
 * the result is serialised back to Node.js) unless a textual default is noted.
 *
 * @returns {{name: string, upc: (string|undefined), price: (number|string),
 *   description: string, ratings: string, num_ratings: number, brand: string,
 *   manufacturer: (string|undefined), dimensions: (string|undefined),
 *   weight: (string|undefined), best_sellers_rank: (string|undefined),
 *   category: (string|undefined), sub_category: (string|undefined),
 *   images: string}}
 */
function scrapeProductPage() {
  // Removes parenthesised asides, including the brackets themselves.
  const stripParens = (text) => text.replace(/\([^()]*\)/g, "");

  // Price: regular price first, deal price as fallback. The first number in
  // the text is taken, so it works with or without a space after the symbol.
  const priceNode =
    document.querySelector("span#priceblock_ourprice") ||
    document.querySelector("span#priceblock_dealprice");
  const priceMatch = priceNode
    ? priceNode.innerText.match(/\d[\d,]*(?:\.\d+)?/)
    : null;
  const price = priceMatch
    ? parseFloat(priceMatch[0].replace(/,/g, ""))
    : "Not Given";

  // Name and description: commas and newlines become spaces.
  const nameNode = document.querySelector("span#productTitle");
  const name = nameNode
    ? nameNode.innerText.replace(/[,\n]/g, " ")
    : "Not Found";

  const descriptionNode = document.querySelector("div#productDescription");
  const description = descriptionNode
    ? descriptionNode.innerText.replace(/[,\n]/g, " ")
    : "";

  // Rating: first token of the text, e.g. "3.6 out of 5" => "3.6".
  const ratingsNode = document.querySelector("span[data-hook*=rating]");
  const ratings = ratingsNode
    ? ratingsNode.innerText.split(" ")[0]
    : "No ratings";

  // Number of ratings: strip EVERY thousands separator ("1,234,567" => 1234567).
  const countNode = document.querySelector(
    "div[data-hook='total-review-count'] span"
  );
  const parsedCount = countNode
    ? Number(countNode.innerText.split(" ")[0].replace(/,/g, ""))
    : 0;
  const num_ratings = Number.isNaN(parsedCount) ? 0 : parsedCount;

  // Brand: everything after the first word of the byline text.
  const brandNode = document.querySelector("a#bylineInfo");
  const brandWords = brandNode ? brandNode.innerText.split(" ") : [];
  const brand =
    brandWords.length > 1 ? brandWords.slice(1).join(" ") : "Not Given";

  // Values filled from the product-details section. The category starts as
  // the navbar entry and is replaced when a best-sellers rank provides one.
  const details = {
    upc: undefined,
    manufacturer: undefined,
    dimensions: undefined,
    weight: undefined,
    best_sellers_rank: undefined,
    category: document.querySelector("#nav-subnav a span")?.innerText,
    sub_category: undefined,
  };

  // Stores one key/value pair from the details section. The key is matched by
  // substring (e.g. "product manufacturer" contains "manufacturer"); the
  // first matching rule wins, in the order written below.
  const recordDetail = (key, rawValue) => {
    const value = rawValue.trim();
    if (
      (key.includes("upc") || key.includes("item part number")) &&
      /^\d{12,13}$/.test(value) // only 12- or 13-digit codes are accepted
    ) {
      details.upc = value;
    } else if (key.includes("manufacturer")) {
      details.manufacturer = value;
    } else if (key.includes("dimension")) {
      details.dimensions = value;
    } else if (key.includes("weight")) {
      details.weight = value;
    } else if (key.includes("best sellers rank")) {
      // One ranking per line, e.g. "#250 in [category] (...)".
      const rankLines = value
        .split("\n")
        .map((line) => stripParens(line).trim())
        .filter((line) => line.length > 0);
      if (rankLines.length === 0) {
        return;
      }
      details.best_sellers_rank = rankLines.join(" AND ");

      // Keep only what follows the standalone word "in", without commas.
      const [rankCategory, rankSubCategory] = rankLines.map((line) =>
        line
          .replace(/^.*?\bin\s+/, "")
          .replace(/,/g, "")
          .trim()
      );
      details.category = rankCategory || details.category;
      details.sub_category = rankSubCategory;
    }
  };

  // The details are presented either as table rows or as a bullet list; the
  // bullet list takes precedence when both are present.
  const tableRows = Array.from(
    document.querySelectorAll("table[id*=productDetails] tr")
  );
  const bulletItems = Array.from(
    document.querySelectorAll("div#detailBullets_feature_div li")
  );

  if (bulletItems.length > 0) {
    for (const item of bulletItems) {
      // Usually two nested spans: the first is the key, the second the value.
      const [key, nestedValue] = Array.from(
        item.querySelectorAll("span > span")
      ).map((span) => span.innerText.toLowerCase());
      if (!key) {
        continue;
      }
      // Otherwise the text sits in one span as "key: value".
      const value =
        nestedValue ||
        item.querySelector("span").innerText.split(": ")[1]?.toLowerCase();
      if (value) {
        recordDetail(key, value);
      }
    }
  } else {
    for (const row of tableRows) {
      // th holds the key, td the value; rows missing either cell are skipped.
      const key = row.querySelector("th")?.innerText.toLowerCase();
      const value = row.querySelector("td")?.innerText.toLowerCase();
      if (key && value) {
        recordDetail(key, value);
      }
    }
  }

  // Images: skip the 360-degree icon and strip the "._..._" size segment so
  // every URL is in its normalised form. One URL per line.
  const iconPrefix =
    "https://images-na.ssl-images-amazon.com/images/G/31/HomeCustomProduct/360_icon";
  const images = Array.from(document.querySelectorAll("div#altImages img"))
    .map((img) => img.src)
    .filter((src) => !src.includes(iconPrefix))
    .map((src) => src.replace(/\._.*_/g, ""))
    .join("\n");

  return {
    name,
    upc: details.upc,
    price,
    description,
    ratings,
    num_ratings,
    brand,
    manufacturer: details.manufacturer,
    dimensions: details.dimensions,
    weight: details.weight,
    best_sellers_rank: details.best_sellers_rank,
    category: details.category,
    sub_category: details.sub_category,
    images,
  };
}

/**
 * Turns the raw values returned by scrapeProductPage() into the final set of
 * values that is logged and written to the output file.
 *
 * @param {object} data - Result of scrapeProductPage().
 * @returns {object} Cleaned values keyed by the same names used in the log.
 */
function buildRecord(data) {
  return {
    name: data.name,
    upc: data.upc,
    price: data.price,
    category: data.category ? data.category.toLowerCase().trim() : undefined,
    sub_category: data.sub_category
      ? data.sub_category.toLowerCase().trim()
      : undefined,
    images: data.images,
    description: data.description,
    ratings: data.ratings,
    num_ratings: data.num_ratings,
    best_sellers_rank: data.best_sellers_rank
      ? data.best_sellers_rank.replace(/,/g, "").trim()
      : NOT_GIVEN,
    brand: data.brand,
    manufacturer: data.manufacturer
      ? data.manufacturer.replace(/,/g, "")
      : NOT_GIVEN,
    weight: data.weight ? data.weight.replace(/,/g, "") : NOT_GIVEN,
    dimensions: data.dimensions
      ? data.dimensions.replace(/,/g, ";")
      : NOT_GIVEN,
  };
}

/**
 * Builds one output line in the column order of CSV_HEADER.
 *
 * Images and description are always quoted; every other field is quoted only
 * when its content requires it.
 *
 * @param {string} asin - ASIN the record belongs to.
 * @param {object} record - Result of buildRecord().
 * @returns {string} The CSV line, terminated by a newline.
 */
function formatCsvRow(asin, record) {
  const fields = [
    csvField(asin),
    csvField(record.upc),
    csvField(record.name),
    csvField(record.price),
    csvField(record.category),
    csvField(record.sub_category),
    csvField(record.images, true),
    csvField(record.description, true),
    csvField(record.ratings),
    csvField(record.num_ratings),
    csvField(record.best_sellers_rank),
    csvField(record.brand),
    csvField(record.manufacturer),
    csvField(record.weight),
    csvField(record.dimensions),
  ];
  return `${fields.join(",")}\n`;
}

/*
Entry point: reads ./asins.csv, searches amazon.in for every ASIN, scrapes the
first matching product page and appends one row per product to ./output.csv.
*/
(async () => {
  // Read the input before launching the browser so that a missing or
  // unreadable file fails without leaving a browser process behind.
  const asins = parseAsins(
    fs.readFileSync("./asins.csv", { encoding: "utf-8" })
  );

  // Create (or truncate) the output file with its header row.
  fs.writeFileSync("./output.csv", CSV_HEADER);

  const browser = await puppeteer.launch();
  try {
    // A single page is reused for every navigation.
    const page = await browser.newPage();

    for (const [index, asin] of asins.entries()) {
      // Search for the ASIN. timeout: 0 disables the navigation timeout.
      await page.goto(`https://amazon.in/s?k=${encodeURIComponent(asin)}`, {
        waitUntil: "networkidle0",
        timeout: 0,
      });
      console.log(`entered ${asin}`);

      const url = await page.evaluate(findProductUrl, asin);
      console.log({ url });

      console.log(
        "____________________________________________________________________________"
      );
      console.log(`Product Number: ${index + 1}`);
      console.log(
        "‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾"
      );

      // No link for this ASIN in the search results: nothing to scrape.
      if (!url) {
        continue;
      }

      await page.goto(url, {
        waitUntil: "domcontentloaded",
        timeout: 0,
      });

      const record = buildRecord(await page.evaluate(scrapeProductPage));
      console.log(record);

      fs.appendFileSync("./output.csv", formatCsvRow(asin, record));
    }
  } finally {
    // Always release the browser, even when a navigation or scrape throws.
    await browser.close();
  }
})().catch((error) => {
  // Report the failure instead of leaving an unhandled rejection.
  console.error(error);
  process.exitCode = 1;
});
329