puning-real-estate/scripts/scraperUtils.js
2026-01-22 16:04:40 +08:00

104 lines
3.7 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import axios from './axios.js';
import * as cheerio from 'cheerio';
/**
 * Scrapes every page of a paginated listing on the target ASP.NET website.
 *
 * The first page is fetched with GET. Each following page is requested by
 * POSTing back to `startUrl` the hidden state fields read from the most
 * recently loaded page, the form values captured from the first page, and the
 * "next page" event target.
 *
 * Scraping stops early when a loaded page has no `__VIEWSTATE` value or when
 * `extractDataFromHtml` yields an empty result for a page. Errors thrown by
 * the HTTP client or by `extractDataFromHtml` are not caught here.
 *
 * @param {string} startUrl - The initial URL to begin scraping; also used as the POST target and Referer.
 * @param {Function} extractDataFromHtml - Page-specific extractor, called as
 *   `extractDataFromHtml($, origin)` with the loaded Cheerio root and the origin of
 *   `startUrl`. Its (awaited) result is expected to be an array of records.
 * @returns {Promise<Array<Object>>} - All scraped records, in page order.
 */
export async function scrapePaginatedData(startUrl, extractDataFromHtml) {
  let allData = [];
  const origin = new URL(startUrl).origin;

  console.log(`开始抓取第一页数据: ${startUrl}`);
  const firstResponse = await axios.get(startUrl);
  let $ = cheerio.load(firstResponse.data);
  const firstPageData = await extractDataFromHtml($, origin);
  allData = allData.concat(firstPageData);
  console.log(`第一页抓取完成,获得 ${firstPageData.length} 条数据。`);

  const totalPages = readTotalPages($);
  console.log(`共发现 ${totalPages} 页。`);
  if (totalPages <= 1) {
    console.log('\n抓取全站数据完毕');
    return allData;
  }

  // The search-form values are captured once, from the first page, and
  // re-sent unchanged with every pagination request.
  const formValues = collectFormValues($);

  for (let page = 2; page <= totalPages; page++) {
    console.log(`正在抓取第 ${page} 页...`);

    // `$` still points at the previously loaded page here, so the hidden
    // state fields sent are the ones the server issued most recently.
    const postData = buildNextPagePostData($, formValues, page - 1);
    if (postData === null) {
      console.log('无法找到 __VIEWSTATE终止抓取。');
      break;
    }

    const response = await axios.post(startUrl, postData, {
      headers: {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': startUrl,
      },
    });
    $ = cheerio.load(response.data);

    const nextPageData = await extractDataFromHtml($, origin);
    console.log(`${page} 页抓取完成,获得 ${nextPageData.length} 条数据。`);
    if (nextPageData.length === 0) {
      console.log(`${page} 页没有数据,抓取结束。`);
      break;
    }
    allData = allData.concat(nextPageData);
  }

  console.log(`\n抓取全站数据完毕!共 ${allData.length} 条原始记录。`);
  return allData;
}

/**
 * Reads the total page count from the pager label.
 * Falls back to 1 when the label is missing or its text is not a number, so
 * the caller never has to deal with NaN.
 */
function readTotalPages($) {
  const pageCountLabel = $('#PageNavigator1_LblPageCount');
  if (!pageCountLabel.length) {
    return 1;
  }
  const parsed = Number.parseInt(pageCountLabel.text(), 10);
  return Number.isNaN(parsed) ? 1 : parsed;
}

/**
 * Collects name -> value pairs for inputs whose name starts with "txt" and for
 * all selects. Unnamed elements are skipped; a falsy value becomes ''.
 * When a name occurs more than once, the last value wins.
 */
function collectFormValues($) {
  const formValues = new Map();
  $('input[name^="txt"], select').each((idx, el) => {
    const name = $(el).attr('name');
    if (name) {
      formValues.set(name, $(el).val() || '');
    }
  });
  return formValues;
}

/**
 * Builds the form body that requests the page after `currentPageIndex`.
 * Returns null when the loaded page has no `__VIEWSTATE` value.
 */
function buildNextPagePostData($, formValues, currentPageIndex) {
  const viewState = $('#__VIEWSTATE').val();
  if (!viewState) {
    return null;
  }

  const postData = new URLSearchParams();
  postData.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext');
  postData.append('__EVENTARGUMENT', '');
  postData.append('__VIEWSTATE', viewState);

  // URLSearchParams stringifies its arguments, so appending a missing field
  // would send the literal text "undefined"; only send fields that exist.
  const viewStateGenerator = $('#__VIEWSTATEGENERATOR').val();
  if (viewStateGenerator != null) {
    postData.append('__VIEWSTATEGENERATOR', viewStateGenerator);
  }
  const eventValidation = $('#__EVENTVALIDATION').val();
  if (eventValidation) {
    postData.append('__EVENTVALIDATION', eventValidation);
  }

  for (const [name, value] of formValues) {
    postData.append(name, value);
  }
  postData.append('PageNavigator1$txtNewPageIndex', String(currentPageIndex));
  return postData;
}
/**
 * Renumbers the "序号" field of every record, in place, as the strings
 * "1", "2", "3", ... following the records' order in the array.
 *
 * @param {Array<Object>} dataArray - Records to renumber; each element is mutated.
 * @returns {Array<Object>} - The same array instance that was passed in, or a
 *   new empty array when the input is falsy or has no elements.
 */
export function reIndex(dataArray) {
  const hasRecords = Boolean(dataArray) && dataArray.length !== 0;
  if (!hasRecords) {
    return [];
  }

  let position = 0;
  while (position < dataArray.length) {
    const record = dataArray[position];
    position += 1;
    // The sequence number is 1-based and stored as a string.
    record['序号'] = String(position);
  }

  console.log('序号字段已重新编号。');
  return dataArray;
}