import axios from './axios.js';
import * as cheerio from 'cheerio';

/**
 * Scrapes every page of a paginated ASP.NET WebForms listing.
 *
 * The first page is fetched with GET. Each following page is requested by
 * POSTing the hidden postback fields (`__VIEWSTATE` and friends) taken from
 * the most recently loaded page, targeting the "next page" link button.
 *
 * @param {string} startUrl - Absolute URL of the first page; also used as the
 *   POST target and `Referer` for every subsequent page.
 * @param {Function} extractDataFromHtml - Page-specific extractor, called as
 *   `extractDataFromHtml($, origin)` with the loaded Cheerio object and the
 *   origin of `startUrl`. Its (awaited) result is expected to be an array of
 *   records.
 * @returns {Promise<Array<Object>>} All records from all scraped pages, in
 *   page order.
 */
export async function scrapePaginatedData(startUrl, extractDataFromHtml) {
  let allData = [];
  const origin = new URL(startUrl).origin;

  console.log(`开始抓取第一页数据: ${startUrl}`);
  let response = await axios.get(startUrl);
  let $ = cheerio.load(response.data);

  const firstPageData = await extractDataFromHtml($, origin);
  allData = allData.concat(firstPageData);
  console.log(`第一页抓取完成,获得 ${firstPageData.length} 条数据。`);

  // Read the total page count from the pager label. A missing or
  // non-numeric label is treated as a single page.
  const totalPagesSpan = $('#PageNavigator1_LblPageCount');
  const parsedPageCount = totalPagesSpan.length
    ? Number.parseInt(totalPagesSpan.text(), 10)
    : 1;
  const totalPages = Number.isNaN(parsedPageCount) ? 1 : parsedPageCount;
  console.log(`共发现 ${totalPages} 页。`);

  if (totalPages <= 1) {
    console.log('\n抓取全站数据完毕!');
    return allData;
  }

  // Snapshot the form fields (inputs whose name starts with "txt", and all
  // selects) from the first page; they are re-sent with every postback.
  const formValues = {};
  $('input[name^="txt"], select').each((idx, el) => {
    const name = $(el).attr('name');
    if (name) {
      formValues[name] = $(el).val() || '';
    }
  });

  // Scrape the remaining pages one at a time. Requests are sequential on
  // purpose: each POST needs the hidden fields from the previous response.
  for (let page = 2; page <= totalPages; page++) {
    console.log(`正在抓取第 ${page} 页...`);

    const viewState = $('#__VIEWSTATE').val();
    const viewStateGenerator = $('#__VIEWSTATEGENERATOR').val();
    const eventValidation = $('#__EVENTVALIDATION').val();

    if (!viewState) {
      console.log('无法找到 __VIEWSTATE,终止抓取。');
      break;
    }

    const postData = new URLSearchParams();
    postData.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext');
    postData.append('__EVENTARGUMENT', '');
    postData.append('__VIEWSTATE', viewState);
    // Optional hidden fields are only sent when present; appending an
    // undefined value would post the literal string "undefined".
    if (viewStateGenerator) {
      postData.append('__VIEWSTATEGENERATOR', viewStateGenerator);
    }
    if (eventValidation) {
      postData.append('__EVENTVALIDATION', eventValidation);
    }
    for (const [name, value] of Object.entries(formValues)) {
      postData.append(name, value);
    }
    // The pager text box carries the index of the page currently shown
    // (the one before the page being requested).
    postData.append('PageNavigator1$txtNewPageIndex', (page - 1).toString());

    response = await axios.post(startUrl, postData, {
      headers: {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': startUrl,
      },
    });

    $ = cheerio.load(response.data);
    const nextPageData = await extractDataFromHtml($, origin);
    console.log(`第 ${page} 页抓取完成,获得 ${nextPageData.length} 条数据。`);

    // An empty page is taken as the end of the data set.
    if (nextPageData.length === 0) {
      console.log(`第 ${page} 页没有数据,抓取结束。`);
      break;
    }

    allData = allData.concat(nextPageData);
  }

  console.log(`\n抓取全站数据完毕!共 ${allData.length} 条原始记录。`);
  return allData;
}

/**
 * Re-numbers the "序号" field of each record sequentially, starting from "1".
 *
 * The records are modified in place and the same array is returned. For an
 * empty or missing input a new empty array is returned instead.
 *
 * @param {Array<Object>} dataArray - The records to re-index.
 * @returns {Array<Object>} The re-indexed array (the same instance as
 *   `dataArray` when it is non-empty).
 */
export function reIndex(dataArray) {
  if (!dataArray || dataArray.length === 0) {
    return [];
  }

  for (let i = 0; i < dataArray.length; i++) {
    // The sequence number is stored as a string.
    dataArray[i]['序号'] = (i + 1).toString();
  }

  console.log('序号字段已重新编号。');
  return dataArray;
}