Setup
Create NodeJS project
npm init
yarn add puppeteer jsdom xpath
Code template
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer');
const jsdom = require('jsdom');
const xpath = require('xpath');

const { JSDOM } = jsdom;

////////////////////////////////////
// Config constants
const openURL = 'https://google.com';
const exportFilePath = 'output/main.html';

////////////////////////////////////////////
// Scrape HTML structure
(async () => {
  // Initialize
  const browser = await puppeteer.launch({ headless: false });

  try {
    const page = await browser.newPage();

    // Open URL
    console.log(openURL);
    await page.goto(openURL, { waitUntil: 'networkidle0' });
    // or use this if the call above did not wait until the page was fully loaded
    // await page.goto(openURL, {waitUntil: 'networkidle2', timeout: 9900000});

    // Full HTML of the rendered page. `page.content()` is used instead of
    // serializing `document.doctype` by hand, which throws when the page
    // has no doctype.
    let html = await page.content();

    /**********************
    // Optional: save the HTML file
    const absFilePath = path.join(__dirname, exportFilePath);
    fs.mkdirSync(path.dirname(absFilePath), { recursive: true });
    fs.writeFileSync(absFilePath, html); // synchronous: takes no callback
    console.log('html file successfully written!');
    */

    /*****************************
    // Optional: read the HTML file back
    html = fs.readFileSync(path.join(__dirname, exportFilePath), 'utf8');
    */

    // Read DOM
    const dom = new JSDOM(html);
    const document = dom.window.document;

    // Work with DOM
    // ...
  } finally {
    // Close the browser even when scraping fails, so no browser process is left behind
    await browser.close();
  }
})().catch((err) => {
  // Surface failures instead of leaving an unhandled promise rejection
  console.error(err);
  process.exitCode = 1;
});
How to open Chrome app, not Chromium?
- Open Terminal
/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222 --no-first-run --no-default-browser-check --user-data-dir=$(mktemp -d -t 'chrome-remote_data_dir')
- Chrome will open, and a line like this will appear in Terminal:
DevTools listening on ws://127.0.0.1:9222/devtools/browser/a5541788-9d01-42f2-ac0b-f86cb5d3aecf
- Update JS code as below
// WebSocket endpoint printed by Chrome ("DevTools listening on ...").
// Replace it with the value from your own Terminal session.
const browserWSEndpoint =
  'ws://127.0.0.1:9222/devtools/browser/a5541788-9d01-42f2-ac0b-f86cb5d3aecf';

// Attach to the already-running Chrome instead of launching a new browser
const browser = await puppeteer.connect({ browserWSEndpoint });

let page = await browser.newPage();
- Avoid calling browser.close(); otherwise you will have to run step 1 again
Ref: Connect to existing Chrome
How to run multiple instances at the same time?
Use puppeteer-cluster
const { Cluster } = require('puppeteer-cluster');

(async () => {
  const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_CONTEXT,
    maxConcurrency: 2,
    puppeteerOptions: { headless: false },
  });

  // The task runs once per queued item; `data` is the value passed to cluster.queue()
  await cluster.task(async ({ page, data: url }) => {
    // Open the URL queued for this task (not a shared constant)
    await page.goto(url);
    // Do something ...
  });

  cluster.queue('http://google.com');
  cluster.queue('https://stackoverflow.com/');

  // Wait for every queued task to finish before shutting the cluster down
  await cluster.idle();
  await cluster.close();
})();
However, puppeteer-cluster has no option to use puppeteer.connect(), so it can only launch Chromium, not the Chrome app
How to debug puppeteer?
- Add debugger statements and console.log calls to your code
- Launch puppeteer with this option
// Launch with the `devtools` option enabled so `debugger` statements in page code can be inspected
puppeteer.launch({devtools: true})
- The browser will then pause at the debugger statement, and you can check the console log
How to click a button?
HTML code
<a role="button" id="button_id">Click Me Now</a>
Puppeteer code
// Wait until the button exists in the DOM. `waitForSelector` replaces the
// deprecated catch-all `page.waitFor`.
await page.waitForSelector('a#button_id');

// Click it from inside the page context
await page.$eval('a#button_id', (button) => button.click());
How to click on an element?
// Quickest way
// <li class='react-tabs__tab'>AAA</li>
// <li class='react-tabs__tab'>BBB</li>
// <li class='react-tabs__tab'>CCC</li>
await page.$$eval('li.react-tabs__tab', (tabs) => {
  // Click the first tab whose text is exactly 'BBB'; do nothing when none matches
  const target = tabs.find((tab) => tab.textContent === 'BBB');
  target?.click();
});
How to fill an input field?
// Assign the value property directly inside the page context
await page.$eval(
  'input[name=search]',
  (input) => (input.value = 'ABCDEFGH'),
);
or
// Type into the field key by key; `delay` is the pause between key presses (ms).
// The selector must be a valid CSS selector (no stray closing bracket).
await page.type('input[name=search]', 'ABCDEFGH', { delay: 20 });
How to sleep for a certain amount of time?
// Option 1 - a promise that resolves once `setTimeout` fires
function sleep(duration) {
  return new Promise((resolve) => {
    setTimeout(resolve, duration);
  });
}
await sleep(3000);

// Option 2 - with a page instance at hand, `waitFor` accepts a duration
// NOTE(review): the numeric form of `page.waitFor` is deprecated in newer
// Puppeteer releases - confirm against the installed version; Option 1 does
// not depend on the Puppeteer API.
await page.waitFor(3000);
How to emulate iPhone X?
// User agent string and viewport size used to emulate an iPhone X
const iPhoneXUserAgent =
  'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1';
const iPhoneXViewport = { width: 375, height: 812 };

let page = await browser.newPage();

// Apply the user agent first, then the viewport
await page.setUserAgent(iPhoneXUserAgent);
await page.setViewport(iPhoneXViewport);
How to go back to the previous page?
// Navigate this page back to the previous page
await page.goBack();
How to work with DOM?
How to select elements by XPath?
Select elements containing text
// All nodes that have a text child containing the keyword 'ABCDEF'
const elements = xpath.select(`//*[text()[contains(., 'ABCDEF')]]`, document);

// First match; `undefined` when nothing matched, hence the guard below
const oneElement = elements[0];
if (oneElement) {
  // ... work with oneElement here
}
Parent, sibling nodes
// Node that follows `oneElement` under the same parent.
// NOTE(review): this may be a text node (e.g. whitespace) rather than an element - confirm that is intended.
const nextSibling = oneElement.nextSibling;
// Node that contains `oneElement`
const parent = oneElement.parentNode;
Get attribute value
// Read the value of the element's `href` attribute
const href = oneElement.getAttribute('href')
Locate element by XPath
//div[@id="top-list"]/div[@data-list]
//div[@id="top-list"]/div[@data-list="1"]
Locate element by class name
/* <div class="Test"> */ //div[@class="Test"]
/* <div class="Test some-other-class"> */ //div[contains(concat(' ', normalize-space(@class), ' '), ' Test ')]
/* <div class="Test some-other-class"> */ //div[contains(concat(' ', @class, ' '), ' Test ')]
Check if element is visible
/**
 * Check whether an element matching an XPath expression becomes visible.
 *
 * Waits on `page.waitForXPath` with `visible: true` and reports the outcome
 * as a boolean instead of throwing.
 *
 * @param {object} page - Object exposing `waitForXPath(selector, options)`.
 * @param {string} xPathSelector - XPath expression of the element to look for.
 * @param {number} [timeout=1000] - Maximum time to wait, in milliseconds.
 * @returns {Promise<boolean>} `true` when the wait resolves, `false` when it rejects.
 */
export async function isVisible(page, xPathSelector, timeout = 1000) {
  try {
    await page.waitForXPath(xPathSelector, { visible: true, timeout });
    return true;
  } catch {
    // Deliberately best-effort: any rejection (timeout or otherwise) is
    // reported as "not visible" so callers can branch on a plain boolean.
    return false;
  }
}
Leave a Reply