Crawl a sitemap
This example downloads and crawls the URLs from a sitemap.
Using CheerioCrawler:
const Apify = require('apify');
Apify.main(async () => {
    // Add URLs to a RequestList from a sitemap
    const sources = [{ requestsFromUrl: 'https://apify.com/sitemap.xml' }];
    const requestList = await Apify.openRequestList('start-urls', sources);
    // Function called for each URL
    const handlePageFunction = async ({ request }) => {
        console.log(request.url);
    };
    // Create a crawler that uses Cheerio
    const crawler = new Apify.CheerioCrawler({
        requestList,
        handlePageFunction,
        maxRequestsPerCrawl: 10, // Limitation for only 10 requests (do not use if you want to crawl a sitemap)
    });
    // Run the crawler
    await crawler.run();
});
Using PuppeteerCrawler:
To run this example on the Apify Platform, select the
apify/actor-node-puppeteer-chromeimage for your Dockerfile.
const Apify = require('apify');
Apify.main(async () => {
    // Add URLs to a RequestList from a sitemap
    const sources = [{ requestsFromUrl: 'https://apify.com/sitemap.xml' }];
    const requestList = await Apify.openRequestList('start-urls', sources);
    // Function called for each URL
    const handlePageFunction = async ({ request }) => {
        console.log(request.url);
    };
    // Create a crawler that runs Puppeteer
    const crawler = new Apify.PuppeteerCrawler({
        requestList,
        handlePageFunction,
        maxRequestsPerCrawl: 10, // Limitation for only 10 requests (do not use if you want to crawl a sitemap)
    });
    // Run the crawler
    await crawler.run();
});
Using PlaywrightCrawler:
To run this example on the Apify Platform, select the
apify/actor-node-playwright-chromeimage for your Dockerfile.
const Apify = require('apify');
Apify.main(async () => {
    // Add URLs to a RequestList from a sitemap
    const sources = [{ requestsFromUrl: 'https://apify.com/sitemap.xml' }];
    const requestList = await Apify.openRequestList('start-urls', sources);
    // Function called for each URL
    const handlePageFunction = async ({ request }) => {
        console.log(request.url);
    };
    // Create a crawler that runs Playwright
    const crawler = new Apify.PlaywrightCrawler({
        requestList,
        handlePageFunction,
        maxRequestsPerCrawl: 10, // Limitation for only 10 requests (do not use if you want to crawl a sitemap)
    });
    // Run the crawler
    await crawler.run();
});