Web Scraping REST API with Node, Express and Puppeteer
Step 1. Create a Node project
Create a folder and open it in a terminal. Type npm init and press enter.
Answer the prompts, or press enter to accept the defaults.
Your Node project is ready.
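npm init writes a package.json into the folder. With default answers it will look roughly like the sketch below (the name comes from your folder, so yours will differ; the main field does not matter here because we start the server directly with node app.js later):

{
  "name": "scraper-api",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "",
  "license": "ISC"
}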
Step 2. Install Puppeteer and Express
Run npm install --save express in the terminal.
Run npm install --save puppeteer in the terminal.
This installs Puppeteer along with a bundled build of Chromium that it controls.
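If you want to confirm the install worked before writing the scraper, a minimal sketch like this (a hypothetical check.js, not part of the final project) launches the bundled browser and prints its version:

// check.js - quick sanity check that Puppeteer and its bundled browser launch
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch({ headless: true });
  console.log('Browser launched:', await browser.version()); // e.g. "HeadlessChrome/..."
  await browser.close();
})();

Run it with node check.js and delete it afterwards if you like.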
Step 3. Create Web Scraping Program
Create a file named app.js and add the following lines to it:
const express = require('express')
const scraper = require('./utils/scraper')

const app = express();

// GET /reviews?url=<product page URL> - returns the scraped reviews as JSON
app.get('/reviews', (req, res) => {
  scraper.extractReviews(req.query.url)
    .then(data => {
      res.status(200).json({ message: "success", data: data })
    })
    .catch(err => res.status(500).json({ message: "Something went wrong. Could not fetch result." }))
});

const port = process.env.PORT || 3000;
app.listen(port, () =>
  console.log(`Example app listening on port ${port}!`),
);
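The target page is passed as a url query parameter, so req.query.url only works cleanly when the target URL is percent-encoded (the example URL in Step 4 happens to work unencoded because it contains no & characters). Once the server is running, you could call the endpoint from another Node script; this is just a rough sketch, assuming Node 18+ so the global fetch is available:

// call-api.js - hypothetical client for the /reviews endpoint
const productUrl = 'http://www.tigerdirect.com/applications/SearchTools/item-details.asp?EdpNo=3415697';

fetch('http://localhost:3000/reviews?url=' + encodeURIComponent(productUrl))
  .then(res => res.json())
  .then(body => console.log(body.data.reviewCount, 'reviews found'))
  .catch(err => console.error('Request failed:', err));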
Create a folder named utils. Inside it, add a file named scraper.js with the following code:
const puppeteer = require('puppeteer'); // import puppeteer

const extractReviews = async (url) => {
  const browser = await puppeteer.launch({ headless: true });
  const page = await browser.newPage();
  await page.goto(url, { waitUntil: 'networkidle2' });

  // Read the total review count from the page's itemprop metadata
  const reviewCount = await page.evaluate(() => document.querySelector('span[itemprop="reviewCount"]').getAttribute('content'));

  let reviewArray = [];
  if (reviewCount > 0) {
    // Reload the page with query parameters that show all reviews on one page
    url = url + "&pagenumber=0&RSort=1&csid=ITD&recordsPerPage=" + reviewCount + "&body=REVIEWS#CustomerReviewsBlock";
    await page.goto(url, { waitUntil: 'load' });

    // Pull the fields of interest out of each .review block
    reviewArray = await page.evaluate(() => Array.from(document.querySelectorAll('.review')).map(review => ({
      reviewTitle: review.querySelector('.rightCol blockquote h6').textContent,
      reviewComment: review.querySelector('.rightCol blockquote p').textContent,
      reviewRating: +review.querySelector('.leftCol .itemReview dd .itemRating strong').textContent,
      reviewDate: review.querySelector('.leftCol .reviewer dd:nth-of-type(2)').textContent,
      reviewer: review.querySelector('.leftCol .reviewer dd:nth-of-type(1)').textContent
    })));
  }

  await browser.close();
  return { reviewCount: +reviewCount, reviewArray: reviewArray, url: url };
};

module.exports.extractReviews = extractReviews;
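You can also exercise the scraper on its own, without going through Express. A rough sketch (a hypothetical test-scraper.js, run from the project folder, using the same example product page as in Step 4):

// test-scraper.js - call the scraper directly, bypassing the API
const scraper = require('./utils/scraper');

const url = 'http://www.tigerdirect.com/applications/SearchTools/item-details.asp?EdpNo=3415697';

scraper.extractReviews(url)
  .then(result => console.log(JSON.stringify(result, null, 2)))
  .catch(err => console.error('Scraping failed:', err));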
Step 4. Run and Test the API
Run node app.js in the terminal to start the server on localhost.
In the browser, open the following URL to test your scraping API: http://localhost:3000/reviews/?url=http://www.tigerdirect.com/applications/SearchTools/item-details.asp?EdpNo=3415697
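If everything is wired up, the response should have roughly the shape below, which follows from the code in app.js and scraper.js (the values here are only placeholders; url is echoed back with the extra review-page parameters appended):

{
  "message": "success",
  "data": {
    "reviewCount": 3,
    "reviewArray": [
      {
        "reviewTitle": "<review title>",
        "reviewComment": "<review text>",
        "reviewRating": 5,
        "reviewDate": "<date>",
        "reviewer": "<reviewer name>"
      }
    ],
    "url": "<the requested URL with the review-page parameters appended>"
  }
}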
Tadaaaaa!
If it did not work, let me know in the comments.