// scraper.js
import fetch from 'node-fetch';
import * as cheerio from 'cheerio';
import { URL } from 'url';

/**
 * Extract a single field from a loaded document.
 *
 * @param {Function} $ - The cheerio instance returned by `cheerio.load`.
 * @param {{ css: string, attr?: string, multiple?: boolean }} selector
 *   `css` picks the elements; `attr` (when given) reads that attribute
 *   instead of the trimmed text; `multiple` returns every match as an array.
 * @returns {string|undefined|Array<string>} The first match's value (which is
 *   `undefined` when `attr` is requested but absent), or an array of values
 *   when `multiple` is set.
 */
function extractField($, { css, attr, multiple }) {
  const matches = $(css);
  const readValue = (node) => (attr ? node.attr(attr) : node.text().trim());

  if (multiple) {
    // NOTE(review): per the cheerio docs, `.map()` drops entries for which the
    // callback returns null/undefined, so elements lacking `attr` are omitted
    // from the array rather than appearing as holes — confirm this is desired.
    return matches.map((_, el) => readValue($(el))).get();
  }

  return readValue(matches.first());
}

/**
 * Scrape a page and extract fields based on CSS selectors.
 *
 * @param {string} pageUrl - The URL to scrape.
 * @param {Object} [selectors={}] - Map of
 *   `{ fieldName: { css: string, attr?: string, multiple?: boolean } }`.
 * @returns {Promise<Object|null>} An object containing `url` plus one property
 *   per selector field, or `null` when the response status is not OK.
 *   The promise rejects if `fetch` itself fails (it is not caught here).
 */
async function scrapePage(pageUrl, selectors = {}) {
  console.log(`Fetching: ${pageUrl}`);

  const res = await fetch(pageUrl, {
    headers: { 'User-Agent': 'ParserBotJS/1.0 (+https://example.com/bot)' },
  });

  if (!res.ok) {
    console.error(`Failed to fetch ${pageUrl}: ${res.status}`);
    return null;
  }

  const html = await res.text();
  const $ = cheerio.load(html);

  const result = { url: pageUrl };
  for (const [field, sel] of Object.entries(selectors)) {
    result[field] = extractField($, sel);
  }
  return result;
}

// Example usage:
(async () => {
  const selectors = {
    title: { css: 'h1' },
    links: { css: 'a', attr: 'href', multiple: true },
  };

  const data = await scrapePage('https://example.org', selectors);
  console.log(JSON.stringify(data, null, 2));
})().catch((err) => {
  // Without this handler a failed fetch surfaces as an unhandled rejection.
  console.error('Scrape failed:', err);
  process.exitCode = 1;
});

// 0 Comments