import axios from './axios.js';
import * as cheerio from 'cheerio';
/**
 * A generic function to scrape paginated data from the target ASP.NET website.
 *
 * The first page is fetched with GET. Every following page is requested by
 * POSTing back to the same URL with the hidden state fields read from the most
 * recently loaded page, using `PageNavigator1$LnkBtnNext` as the event target.
 * Scraping stops early when a page has no `__VIEWSTATE` field or when the
 * extractor returns an empty array for a page.
 *
 * @param {string} startUrl - The initial URL to begin scraping; also used as the POST target and Referer.
 * @param {Function} extractDataFromHtml - The page-specific function to extract data; called as
 *   `extractDataFromHtml($, origin)` and may return the records directly or a Promise of them.
 * @returns {Promise<Array<Object>>} - An array of all scraped data records, in page order.
 */
export async function scrapePaginatedData(startUrl, extractDataFromHtml) {
  const allData = [];
  const { origin } = new URL(startUrl);

  console.log(`开始抓取第一页数据: ${startUrl}`);
  const firstResponse = await axios.get(startUrl);
  let $ = cheerio.load(firstResponse.data);

  const firstPageData = await extractDataFromHtml($, origin);
  allData.push(...firstPageData);
  console.log(`第一页抓取完成,获得 ${firstPageData.length} 条数据。`);

  // Get total pages for pagination
  const totalPages = parseTotalPages($);
  console.log(`共发现 ${totalPages} 页。`);

  if (totalPages <= 1) {
    console.log('\n抓取全站数据完毕!');
    return allData;
  }

  // The search-form values are captured once, from the first page, and are
  // re-sent unchanged with every pagination request.
  const formValues = collectFormValues($);

  // Scrape remaining pages
  for (let pageNumber = 2; pageNumber <= totalPages; pageNumber++) {
    console.log(`正在抓取第 ${pageNumber} 页...`);

    // `$` still points at the previously loaded page here, so the hidden
    // state fields sent are always the ones the server issued last.
    const postData = buildNextPagePostData($, formValues, pageNumber - 1);
    if (postData === null) {
      console.log('无法找到 __VIEWSTATE,终止抓取。');
      break;
    }

    const response = await axios.post(startUrl, postData, {
      headers: {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': startUrl,
      },
    });

    $ = cheerio.load(response.data);
    const nextPageData = await extractDataFromHtml($, origin);
    console.log(`第 ${pageNumber} 页抓取完成,获得 ${nextPageData.length} 条数据。`);

    if (nextPageData.length === 0) {
      console.log(`第 ${pageNumber} 页没有数据,抓取结束。`);
      break;
    }
    allData.push(...nextPageData);
  }

  console.log(`\n抓取全站数据完毕!共 ${allData.length} 条原始记录。`);
  return allData;
}

/**
 * Reads the page count from `#PageNavigator1_LblPageCount`.
 * Falls back to 1 when the element is missing or its text is not a positive
 * integer, so callers never see `NaN`.
 *
 * @param {Function} $ - The Cheerio instance for the loaded page.
 * @returns {number} The number of pages, at least 1.
 */
function parseTotalPages($) {
  const pageCountLabel = $('#PageNavigator1_LblPageCount');
  if (!pageCountLabel.length) {
    return 1;
  }
  const parsed = Number.parseInt(pageCountLabel.text(), 10);
  return Number.isInteger(parsed) && parsed >= 1 ? parsed : 1;
}

/**
 * Collects the current values of every `select` and every `input` whose name
 * starts with "txt". Elements without a name are skipped; a missing value is
 * stored as an empty string.
 *
 * @param {Function} $ - The Cheerio instance for the loaded page.
 * @returns {Map<string, *>} Field name to field value (last one wins on duplicate names).
 */
function collectFormValues($) {
  // A Map is used because the names come from the scraped page; it avoids
  // special object keys such as "__proto__".
  const formValues = new Map();
  $('input[name^="txt"], select').each((idx, el) => {
    const name = $(el).attr('name');
    if (name) {
      formValues.set(name, $(el).val() ?? '');
    }
  });
  return formValues;
}

/**
 * Builds the form body for the "next page" postback from the hidden state
 * fields of the currently loaded page.
 *
 * @param {Function} $ - The Cheerio instance for the page the postback originates from.
 * @param {Map<string, *>} formValues - Extra form fields to send along.
 * @param {number} currentPageIndex - Value sent as `PageNavigator1$txtNewPageIndex`
 *   (the number of the page currently loaded).
 * @returns {URLSearchParams|null} The POST body, or `null` when the page has no `__VIEWSTATE`.
 */
function buildNextPagePostData($, formValues, currentPageIndex) {
  const viewState = $('#__VIEWSTATE').val();
  if (!viewState) {
    return null;
  }
  const viewStateGenerator = $('#__VIEWSTATEGENERATOR').val();
  const eventValidation = $('#__EVENTVALIDATION').val();

  const postData = new URLSearchParams();
  postData.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext');
  postData.append('__EVENTARGUMENT', '');
  postData.append('__VIEWSTATE', viewState);
  // Only send the generator when the page actually has one; appending an
  // undefined value would post the literal string "undefined".
  if (viewStateGenerator != null) {
    postData.append('__VIEWSTATEGENERATOR', viewStateGenerator);
  }
  if (eventValidation) {
    postData.append('__EVENTVALIDATION', eventValidation);
  }

  for (const [name, value] of formValues) {
    postData.append(name, value);
  }
  postData.append('PageNavigator1$txtNewPageIndex', String(currentPageIndex));
  return postData;
}
|
||
|
||
/**
 * Re-indexes the "序号" field of each record in an array sequentially, starting from 1.
 *
 * The records are modified in place: each one gets its "序号" property set to
 * its 1-based position as a string, and the same array instance is returned.
 *
 * @param {Array<Object>} dataArray - The array of data to re-index.
 * @returns {Array<Object>} - The re-indexed data array (the input array itself),
 *   or a new empty array when the input is missing, empty, or not an array.
 */
export function reIndex(dataArray) {
  // Guard on Array.isArray so the documented Array return type always holds,
  // even when a non-array value is passed by mistake.
  if (!Array.isArray(dataArray) || dataArray.length === 0) {
    return [];
  }

  dataArray.forEach((record, position) => {
    record['序号'] = String(position + 1);
  });
  console.log('序号字段已重新编号。');
  return dataArray;
}
|