Unable to implement any logic to scrape content from innermost pages using puppeteer

I’ve created a script using Puppeteer to scrape the links of different authors from a webpage, traversing multiple pages by triggering a click on the next-page button. The script appears to be working correctly.

Although the content of this site is static, I intentionally used Puppeteer in the following script purely to learn how to parse content from inner pages.

I wish to go one layer deeper and scrape the description from each author page. How can I achieve that?

const puppeteer = require('puppeteer');

/**
 * Scrapes author links and quote texts from quotes.toscrape.com,
 * paging forward with the "next" button.
 *
 * Fixes over the original:
 *  - Drops the `new Promise(async (resolve, reject) => ...)` anti-pattern:
 *    an async function already returns a Promise.
 *  - The original `Promise.all([await click, await waitForSelector, ...])`
 *    awaited each step sequentially, so the click and the navigation wait
 *    were never actually raced and the navigation could be missed. The
 *    click and `waitForNavigation` are now started together inside
 *    `Promise.all`.
 *  - `browser.close()` is now awaited and runs in `finally`, so the
 *    browser no longer leaks when an error is thrown mid-scrape.
 *
 * @param {number} [pagesToScrape=1] - How many listing pages to visit.
 * @returns {Promise<Array<{authorUrl: string, title: string}>>}
 */
async function run(pagesToScrape = 1) {
    const browser = await puppeteer.launch({headless:false});
    try {
        const [page] = await browser.pages();
        await page.goto("https://quotes.toscrape.com/");
        let urls = [];
        for (let currentPage = 1; currentPage <= pagesToScrape; currentPage++) {
            // Make sure the quote cards are present before reading them.
            await page.waitForSelector('[class="quote"]');
            const newUrls = await page.evaluate(() => {
                let results = [];
                let items = document.querySelectorAll('[class="quote"]');
                items.forEach((item) => {
                    results.push({
                        authorUrl:  'https://quotes.toscrape.com' + item.querySelector("small.author + a").getAttribute('href'),
                        title: item.querySelector("span.text").innerText
                    });
                });
                return results;
            });
            urls = urls.concat(newUrls);
            if (currentPage < pagesToScrape) {
                // Start the navigation wait BEFORE the click resolves, so the
                // resulting navigation cannot slip past unobserved.
                await Promise.all([
                    page.click('li.next > a'),
                    page.waitForNavigation()
                ]);
            }
        }
        return urls;
    } finally {
        await browser.close();
    }
}
run(3).then(console.log).catch(console.error);

Answer

I would go this way:

const puppeteer = require('puppeteer');

let browser;

// Scrapes quotes from the listing pages and, for each newly-seen author,
// opens their "about" page in a second tab to grab the author details.
// Output: { quotes: { [author]: string[] }, abouts: { [author]: string } }
// printed as pretty JSON; the browser is always closed in `finally`.
(async function main() {
  browser = await puppeteer.launch({ headless: false, defaultViewport: null });

  const [pageQuotes] = await browser.pages();
  const pageAbout = await browser.newPage();
  await pageQuotes.bringToFront(); // Otherwise, click on the next page link does not work.

  const pagesToScrape = 3;

  await pageQuotes.goto('https://quotes.toscrape.com/');

  const data = { quotes: {}, abouts: {} };
  const visitedAbouts = new Set();

  for (let pageNumber = 1; pageNumber <= pagesToScrape; pageNumber++) {
    await pageQuotes.waitForSelector('.quote');

    // Pull both the quote texts and the author-page links in one evaluate.
    const extracted = await pageQuotes.evaluate(() => {
      const cards = Array.from(document.querySelectorAll('.quote'));
      return {
        quotes: cards.map(
          quote => [quote.querySelector('small.author').innerText, quote.innerText],
        ),
        aboutURLs: Array.from(
          document.querySelectorAll('.quote small.author + a[href]'),
          link => link.href,
        ),
      };
    });

    // Group quotes by author, creating the bucket on first sight.
    for (const [author, quoteText] of extracted.quotes) {
      (data.quotes[author] || (data.quotes[author] = [])).push(quoteText);
    }

    // Visit each author page at most once, in the secondary tab.
    for (const aboutURL of extracted.aboutURLs) {
      if (visitedAbouts.has(aboutURL)) continue;
      visitedAbouts.add(aboutURL);

      await pageAbout.goto(aboutURL);
      await pageAbout.waitForSelector('div.author-details');

      const details = await pageAbout.evaluate(() => ({
        title: document.querySelector('div.author-details h3.author-title').innerText,
        about: document.querySelector('div.author-details').innerText,
      }));

      data.abouts[details.title] = details.about;
    }

    if (pageNumber < pagesToScrape) {
      const nextLink = await pageQuotes.waitForSelector('li.next > a');

      // Race the click against the navigation wait so it is never missed.
      await Promise.all([
        nextLink.click(),
        pageQuotes.waitForNavigation(),
      ]);
    }
  }

  console.log(JSON.stringify(data, null, '  '));
})().catch(console.error).finally(async () => { if (browser) await browser.close(); });