scraperUtils
This commit is contained in:
parent
8cdd473c48
commit
f11b997db4
@ -174,8 +174,8 @@
|
||||
"资质等级": "二级",
|
||||
"核准预售套数": 236,
|
||||
"核准预售面积": "0",
|
||||
"已售总套数": 69,
|
||||
"未售总套数": 167,
|
||||
"已售总套数": 70,
|
||||
"未售总套数": 166,
|
||||
"已售总面积": "0",
|
||||
"未售总面积": "0",
|
||||
"楼盘销售部地址": "星河明珠湾营销中心",
|
||||
@ -337,7 +337,7 @@
|
||||
"楼幢": [
|
||||
{
|
||||
"楼幢名称": "C2幢",
|
||||
"成交均价": "14476.55",
|
||||
"成交均价": "13967.14",
|
||||
"bid": "1099728"
|
||||
},
|
||||
{
|
||||
@ -357,7 +357,7 @@
|
||||
"批准时间": "2025-12-05",
|
||||
"所在区域": "普宁市",
|
||||
"总套数": 109,
|
||||
"可售套数": 90,
|
||||
"可售套数": 89,
|
||||
"许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?d0BkXtgEYDqVcRqLE4C4aQ%3D%3D"
|
||||
}
|
||||
]
|
||||
@ -895,8 +895,8 @@
|
||||
"资质等级": "二级",
|
||||
"核准预售套数": 1186,
|
||||
"核准预售面积": "0",
|
||||
"已售总套数": 844,
|
||||
"未售总套数": 342,
|
||||
"已售总套数": 845,
|
||||
"未售总套数": 341,
|
||||
"已售总面积": "0",
|
||||
"未售总面积": "0",
|
||||
"楼盘销售部地址": "星河明珠湾花园",
|
||||
@ -919,7 +919,7 @@
|
||||
},
|
||||
{
|
||||
"楼幢名称": "B7幢",
|
||||
"成交均价": "8468.21",
|
||||
"成交均价": "8464.22",
|
||||
"bid": "1099699"
|
||||
}
|
||||
],
|
||||
@ -930,7 +930,7 @@
|
||||
"批准时间": "2026-01-21",
|
||||
"所在区域": "普宁市",
|
||||
"总套数": 114,
|
||||
"可售套数": 56,
|
||||
"可售套数": 55,
|
||||
"许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?atcXVrdhLqFDsVq8ylD0hw%3D%3D"
|
||||
},
|
||||
{
|
||||
@ -2123,8 +2123,8 @@
|
||||
"资质等级": "二级",
|
||||
"核准预售套数": 2114,
|
||||
"核准预售面积": "0",
|
||||
"已售总套数": 1817,
|
||||
"未售总套数": 297,
|
||||
"已售总套数": 1818,
|
||||
"未售总套数": 296,
|
||||
"已售总面积": "0",
|
||||
"未售总面积": "0",
|
||||
"楼盘销售部地址": "",
|
||||
@ -2450,7 +2450,7 @@
|
||||
},
|
||||
{
|
||||
"楼幢名称": "14幢",
|
||||
"成交均价": "6356.32",
|
||||
"成交均价": "6512.01",
|
||||
"bid": "1099566"
|
||||
}
|
||||
],
|
||||
@ -2461,7 +2461,7 @@
|
||||
"批准时间": "2025-12-02",
|
||||
"所在区域": "普宁市",
|
||||
"总套数": 301,
|
||||
"可售套数": 176,
|
||||
"可售套数": 175,
|
||||
"许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?d0BkXtgEYDqVhs6gdwhuMw%3D%3D"
|
||||
}
|
||||
]
|
||||
@ -4197,8 +4197,8 @@
|
||||
"资质等级": "二级",
|
||||
"核准预售套数": 1118,
|
||||
"核准预售面积": "0",
|
||||
"已售总套数": 992,
|
||||
"未售总套数": 126,
|
||||
"已售总套数": 994,
|
||||
"未售总套数": 124,
|
||||
"已售总面积": "0",
|
||||
"未售总面积": "0",
|
||||
"楼盘销售部地址": "普宁市北二环大道与铁山兰路交汇处",
|
||||
@ -4302,7 +4302,7 @@
|
||||
"楼幢": [
|
||||
{
|
||||
"楼幢名称": "1幢",
|
||||
"成交均价": "7266.38",
|
||||
"成交均价": "7270.26",
|
||||
"bid": "1099705"
|
||||
}
|
||||
],
|
||||
@ -4313,7 +4313,7 @@
|
||||
"批准时间": "2025-10-24",
|
||||
"所在区域": "普宁市",
|
||||
"总套数": 160,
|
||||
"可售套数": 101,
|
||||
"可售套数": 99,
|
||||
"许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?d0BkXtgEYDq%2FwD%2FQObW7NQ%3D%3D"
|
||||
},
|
||||
{
|
||||
@ -10812,8 +10812,8 @@
|
||||
"资质等级": "暂定资质",
|
||||
"核准预售套数": 398,
|
||||
"核准预售面积": "0",
|
||||
"已售总套数": 407,
|
||||
"未售总套数": -9,
|
||||
"已售总套数": 405,
|
||||
"未售总套数": -7,
|
||||
"已售总面积": "0",
|
||||
"未售总面积": "0",
|
||||
"楼盘销售部地址": "",
|
||||
@ -10826,7 +10826,7 @@
|
||||
"楼幢": [
|
||||
{
|
||||
"楼幢名称": "1号楼",
|
||||
"成交均价": "6168.59",
|
||||
"成交均价": "6176.77",
|
||||
"bid": "-664"
|
||||
},
|
||||
{
|
||||
@ -10847,7 +10847,7 @@
|
||||
"批准时间": "2020-08-20",
|
||||
"所在区域": "普宁市",
|
||||
"总套数": 398,
|
||||
"可售套数": -9,
|
||||
"可售套数": -7,
|
||||
"许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?%2FJnZZ%2FURSYFPzGDwdoVynw%3D%3D"
|
||||
}
|
||||
]
|
||||
|
||||
@ -32,7 +32,7 @@
|
||||
"批准时间": "2026-01-21",
|
||||
"所在区域": "普宁市",
|
||||
"总套数": 114,
|
||||
"可售套数": 56,
|
||||
"可售套数": 55,
|
||||
"许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?atcXVrdhLqFDsVq8ylD0hw%3D%3D"
|
||||
},
|
||||
{
|
||||
@ -152,7 +152,7 @@
|
||||
"批准时间": "2025-12-05",
|
||||
"所在区域": "普宁市",
|
||||
"总套数": 109,
|
||||
"可售套数": 90,
|
||||
"可售套数": 89,
|
||||
"许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?d0BkXtgEYDqVcRqLE4C4aQ%3D%3D"
|
||||
},
|
||||
{
|
||||
@ -212,7 +212,7 @@
|
||||
"批准时间": "2025-12-02",
|
||||
"所在区域": "普宁市",
|
||||
"总套数": 301,
|
||||
"可售套数": 176,
|
||||
"可售套数": 175,
|
||||
"许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?d0BkXtgEYDqVhs6gdwhuMw%3D%3D"
|
||||
},
|
||||
{
|
||||
@ -284,7 +284,7 @@
|
||||
"批准时间": "2025-10-24",
|
||||
"所在区域": "普宁市",
|
||||
"总套数": 160,
|
||||
"可售套数": 101,
|
||||
"可售套数": 99,
|
||||
"许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?d0BkXtgEYDq%2FwD%2FQObW7NQ%3D%3D"
|
||||
},
|
||||
{
|
||||
@ -1628,7 +1628,7 @@
|
||||
"批准时间": "2020-08-20",
|
||||
"所在区域": "普宁市",
|
||||
"总套数": 398,
|
||||
"可售套数": -9,
|
||||
"可售套数": -7,
|
||||
"许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?%2FJnZZ%2FURSYFPzGDwdoVynw%3D%3D"
|
||||
},
|
||||
{
|
||||
|
||||
@ -247,7 +247,7 @@
|
||||
"楼幢": [
|
||||
{
|
||||
"楼幢名称": "C2幢",
|
||||
"成交均价": "14476.55",
|
||||
"成交均价": "13967.14",
|
||||
"bid": "1099728"
|
||||
},
|
||||
{
|
||||
@ -730,7 +730,7 @@
|
||||
},
|
||||
{
|
||||
"楼幢名称": "B7幢",
|
||||
"成交均价": "8468.21",
|
||||
"成交均价": "8464.22",
|
||||
"bid": "1099699"
|
||||
}
|
||||
]
|
||||
@ -1892,7 +1892,7 @@
|
||||
},
|
||||
{
|
||||
"楼幢名称": "14幢",
|
||||
"成交均价": "6356.32",
|
||||
"成交均价": "6512.01",
|
||||
"bid": "1099566"
|
||||
}
|
||||
]
|
||||
@ -3294,7 +3294,7 @@
|
||||
"楼幢": [
|
||||
{
|
||||
"楼幢名称": "1幢",
|
||||
"成交均价": "7266.38",
|
||||
"成交均价": "7270.26",
|
||||
"bid": "1099705"
|
||||
}
|
||||
]
|
||||
@ -8837,7 +8837,7 @@
|
||||
"楼幢": [
|
||||
{
|
||||
"楼幢名称": "1号楼",
|
||||
"成交均价": "6168.59",
|
||||
"成交均价": "6176.77",
|
||||
"bid": "-664"
|
||||
},
|
||||
{
|
||||
|
||||
@ -2,11 +2,12 @@ import axios from './axios.js';
|
||||
import * as cheerio from 'cheerio';
|
||||
import fs from 'fs/promises';
|
||||
import path from 'path';
|
||||
import { fileURLToPath } from 'url'; // Add fileURLToPath import
|
||||
import { getEncryptedUrl } from './getEncryptedUrl.js'; // Import shared encryption function
|
||||
import { fileURLToPath } from 'url';
|
||||
import { getEncryptedUrl } from './getEncryptedUrl.js';
|
||||
import { scrapePaginatedData, reIndex } from './scraperUtils.js'; // Import new utils
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url); // Define __filename
|
||||
const __dirname = path.dirname(__filename); // Define __dirname
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = path.dirname(__filename);
|
||||
|
||||
const BASE_URL = 'http://120.236.48.169:89/HEMS/';
|
||||
const START_URL = BASE_URL + 'CompanyList.aspx';
|
||||
@ -52,111 +53,13 @@ async function extractDataFromHtml($, origin) {
|
||||
return data.filter(item => item !== null);
|
||||
}
|
||||
|
||||
/**
 * Scrapes all company data from the target website, handling pagination.
 *
 * The paging logic used to be copied verbatim into every scraper script; it
 * now lives in the shared `scrapePaginatedData` helper, so this function only
 * supplies the list URL and the page-specific row extractor. Progress
 * messages are therefore the helper's generic ones (the company-specific
 * wording and the total-record-count line are no longer printed).
 *
 * @returns {Promise<Array<Object>>} - An array of all scraped data records.
 */
async function scrapeWebsite() {
  return scrapePaginatedData(START_URL, extractDataFromHtml);
}
|
||||
|
||||
/**
 * Post-processes the scraped company records.
 *
 * Renumbers the "序号" field of every record so it runs "1", "2", ... in
 * array order. Records are mutated in place and the same array is returned.
 *
 * @param {Array<Object>} allData - The raw scraped data.
 * @returns {Array<Object>} - The same array, with "序号" rewritten.
 */
function processScrapedData(allData) {
  for (const [position, record] of allData.entries()) {
    record['序号'] = String(position + 1);
  }
  console.log('序号字段已重新编号。');

  return allData;
}
|
||||
|
||||
// Main function to orchestrate the scraping and saving process
|
||||
export async function main() {
|
||||
try {
|
||||
console.log('🚀 开始抓取普宁房地产开发企业列表...');
|
||||
const allData = await scrapeWebsite();
|
||||
const processedData = processScrapedData(allData);
|
||||
const allData = await scrapePaginatedData(START_URL, extractDataFromHtml); // Use generic pagination
|
||||
const processedData = reIndex(allData); // Use generic re-indexing
|
||||
|
||||
const dataPath = path.join(__dirname, '..', 'data', 'companies.json');
|
||||
await fs.mkdir(path.dirname(dataPath), { recursive: true }); // Ensure data directory exists
|
||||
|
||||
@ -3,6 +3,7 @@ import * as cheerio from 'cheerio';
|
||||
import fs from 'fs/promises';
|
||||
import path from 'path';
|
||||
import { getEncryptedUrl } from './getEncryptedUrl.js';
|
||||
import { scrapePaginatedData, reIndex } from './scraperUtils.js'; // Import new utils
|
||||
|
||||
const BASE_URL = 'http://120.236.48.169:89/HPMS/';
|
||||
const START_URL = BASE_URL + 'presellCertList.aspx';
|
||||
@ -48,99 +49,21 @@ async function extractDataFromHtml($, origin) {
|
||||
return data.filter(item => item !== null);
|
||||
}
|
||||
|
||||
/**
 * Scrapes all data from the target website, handling pagination.
 *
 * Delegates to the shared `scrapePaginatedData` helper instead of carrying a
 * private copy of the paging loop. The helper derives the origin passed to
 * `extractDataFromHtml` from START_URL; START_URL is built from BASE_URL, so
 * the origin is the same one the previous inline implementation used.
 *
 * @returns {Promise<Array<Object>>} - An array of all scraped data records.
 */
async function scrapeWebsite() {
  return scrapePaginatedData(START_URL, extractDataFromHtml);
}
|
||||
|
||||
|
||||
/**
|
||||
* Performs post-processing on the scraped data:
|
||||
* 1. Filters out records where "许可证号" is "空".
|
||||
* 2. Re-indexes "序号" sequentially.
|
||||
* 2. Re-indexes "序号" sequentially using the utility function.
|
||||
* 3. Converts "总套数" and "可售套数" fields to numbers.
|
||||
* @param {Array<Object>} allData - The raw scraped data.
|
||||
* @returns {Promise<Array<Object>>} - The processed data.
|
||||
*/
|
||||
async function processScrapedData(allData) {
|
||||
// 1. Filter out records where "许可证号" is "空"
|
||||
let processedData = allData.filter(record => record['许可证号'] !== '空');
|
||||
console.log(`删除 "许可证号" 为 "空" 的记录后,剩余 ${processedData.length} 条记录。`);
|
||||
// 1. Filter out records where "许可证号" is "空" or null
|
||||
let processedData = allData.filter(record => record['许可证号'] !== '空' && record['许可证号']);
|
||||
console.log(`删除无效许可证记录后,剩余 ${processedData.length} 条记录。`);
|
||||
|
||||
// 2. Re-index "序号" sequentially
|
||||
for (let i = 0; i < processedData.length; i++) {
|
||||
processedData[i]['序号'] = (i + 1).toString();
|
||||
}
|
||||
console.log('序号字段已重新编号。');
|
||||
reIndex(processedData); // Use the shared utility function
|
||||
|
||||
// 3. Convert "总套数" and "可售套数" to numbers
|
||||
for (const record of processedData) {
|
||||
@ -155,14 +78,15 @@ async function processScrapedData(allData) {
|
||||
// 主函数 - 导出以便在根目录调用
|
||||
export async function main() {
|
||||
try {
|
||||
const allData = await scrapeWebsite();
|
||||
console.log('🚀 开始抓取预售许可证数据...');
|
||||
const allData = await scrapePaginatedData(START_URL, extractDataFromHtml); // Use generic pagination
|
||||
|
||||
const processedData = await processScrapedData(allData);
|
||||
|
||||
const dataPath = path.join(path.dirname(new URL(import.meta.url).pathname), '..', 'data', 'preSaleLicense.json');
|
||||
await fs.mkdir(path.dirname(dataPath), { recursive: true });
|
||||
await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8');
|
||||
console.log(`更新后的数据已保存至 data/preSaleLicense.json 文件。`);
|
||||
console.log(`✅ 更新后的数据已保存至 data/preSaleLicense.json 文件。`);
|
||||
|
||||
console.log('\n所有数据处理和文件生成任务已完成。');
|
||||
|
||||
|
||||
@ -5,6 +5,7 @@ import path from 'path';
|
||||
import { fileURLToPath } from 'url';
|
||||
|
||||
import { getEncryptedUrl } from './getEncryptedUrl.js';
|
||||
import { scrapePaginatedData, reIndex } from './scraperUtils.js'; // Import new utils
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = path.dirname(__filename);
|
||||
@ -122,113 +123,20 @@ async function scrapeProjectDetails(project) {
|
||||
}
|
||||
|
||||
|
||||
/**
 * Scrapes the full project list (all pages) and then the detail data of
 * every project found.
 *
 * Listing pagination is delegated to the shared `scrapePaginatedData` helper
 * rather than a private copy of the paging loop. Detail requests are started
 * for all projects at once and awaited together, so a single failing detail
 * request rejects the whole call (same as before).
 *
 * @returns {Promise<Array<Object>>} - One entry per project, as returned by
 *   `scrapeProjectDetails`, in list order.
 */
async function scrapeWebsite() {
  const projectList = await scrapePaginatedData(START_URL, extractDataFromHtml);

  console.log('\n开始抓取项目详情...');
  const detailedData = await Promise.all(
    projectList.map((project) => scrapeProjectDetails(project)),
  );
  console.log('所有项目详情抓取完毕。');

  return detailedData;
}
|
||||
|
||||
/**
 * Post-processes the scraped project records.
 *
 * Rewrites the "序号" field of each record to its 1-based position in the
 * array (as a string). The records are modified in place.
 *
 * @param {Array<Object>} allData - Raw scraped records.
 * @returns {Promise<Array<Object>>} - Resolves to the same array instance.
 */
async function processScrapedData(allData) {
  let sequence = 0;
  for (const record of allData) {
    sequence += 1;
    record['序号'] = `${sequence}`;
  }
  console.log('序号字段已重新编号。');

  return allData;
}
|
||||
// The old scrapeWebsite function is removed.
|
||||
|
||||
// 主函数 - 导出以便在根目录调用
|
||||
export async function main() {
|
||||
try {
|
||||
const allData = await scrapeWebsite();
|
||||
const processedData = await processScrapedData(allData);
|
||||
console.log('🚀 开始抓取项目数据...');
|
||||
const projectList = await scrapePaginatedData(START_URL, extractDataFromHtml);
|
||||
|
||||
console.log('\n开始抓取项目详情...');
|
||||
const detailedDataPromises = projectList.map(project => scrapeProjectDetails(project));
|
||||
const allData = await Promise.all(detailedDataPromises);
|
||||
console.log('所有项目详情抓取完毕。');
|
||||
|
||||
const processedData = reIndex(allData); // Use generic re-indexing
|
||||
|
||||
// 保存为JSON文件
|
||||
const dataPath = path.join(__dirname, '..', 'data', 'project.json');
|
||||
|
||||
103
scripts/scraperUtils.js
Normal file
103
scripts/scraperUtils.js
Normal file
@ -0,0 +1,103 @@
|
||||
import axios from './axios.js';
|
||||
import * as cheerio from 'cheerio';
|
||||
|
||||
/**
 * Reads the total page count from the pager label of a loaded list page.
 * Falls back to 1 when the label is missing or does not contain a number.
 */
function readTotalPages($) {
  const pageCountLabel = $('#PageNavigator1_LblPageCount');
  if (!pageCountLabel.length) {
    return 1;
  }
  const parsed = Number.parseInt(pageCountLabel.text(), 10);
  return Number.isNaN(parsed) ? 1 : parsed;
}

/**
 * Collects the current values of the page's `txt*` inputs and all selects,
 * keyed by field name, so they can be echoed back with every postback.
 */
function collectFormValues($) {
  const formValues = {};
  $('input[name^="txt"], select').each((index, element) => {
    const name = $(element).attr('name');
    if (name) {
      formValues[name] = $(element).val() || '';
    }
  });
  return formValues;
}

/**
 * Builds the form body that asks the server for the page after the one
 * currently loaded in `$`. Returns null when the page has no __VIEWSTATE.
 */
function buildNextPageRequest($, formValues, pageNumber) {
  const viewState = $('#__VIEWSTATE').val();
  if (!viewState) {
    return null;
  }
  const viewStateGenerator = $('#__VIEWSTATEGENERATOR').val();
  const eventValidation = $('#__EVENTVALIDATION').val();

  const postData = new URLSearchParams();
  postData.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext');
  postData.append('__EVENTARGUMENT', '');
  postData.append('__VIEWSTATE', viewState);
  // URLSearchParams stringifies its arguments, so appending a missing field
  // would send the literal text "undefined"; skip the field instead.
  if (viewStateGenerator !== undefined) {
    postData.append('__VIEWSTATEGENERATOR', viewStateGenerator);
  }
  if (eventValidation) {
    postData.append('__EVENTVALIDATION', eventValidation);
  }

  for (const [name, value] of Object.entries(formValues)) {
    postData.append(name, value);
  }
  // The pager text box carries the index of the page we are leaving.
  postData.append('PageNavigator1$txtNewPageIndex', String(pageNumber - 1));

  return postData;
}

/**
 * A generic function to scrape paginated data from the target ASP.NET website.
 *
 * The first page is fetched with GET. Every following page is requested by
 * POSTing the hidden state fields of the most recently loaded page with the
 * pager's "next" link button as the event target. Scraping stops early when a
 * page has no __VIEWSTATE or yields no records.
 *
 * @param {string} startUrl - The initial URL to begin scraping.
 * @param {Function} extractDataFromHtml - The page-specific extractor, called
 *   as `extractDataFromHtml($, origin)`; must return (or resolve to) an array.
 * @param {Object} [options] - Optional collaborators (useful for tests).
 * @param {Object} [options.httpClient] - Object with `get(url)` and
 *   `post(url, body, config)`; defaults to this module's `axios` import.
 * @param {Function} [options.loadHtml] - Turns a response body into a
 *   Cheerio-style `$`; defaults to `cheerio.load`.
 * @returns {Promise<Array<Object>>} - An array of all scraped data records.
 */
export async function scrapePaginatedData(startUrl, extractDataFromHtml, options = {}) {
  // Defaults are only evaluated when the caller does not supply a replacement.
  const {
    httpClient = axios,
    loadHtml = (html) => cheerio.load(html),
  } = options;

  const origin = new URL(startUrl).origin;
  const allData = [];

  console.log(`开始抓取第一页数据: ${startUrl}`);
  let response = await httpClient.get(startUrl);
  let $ = loadHtml(response.data);

  const firstPageData = await extractDataFromHtml($, origin);
  allData.push(...firstPageData);
  console.log(`第一页抓取完成,获得 ${firstPageData.length} 条数据。`);

  const totalPages = readTotalPages($);
  console.log(`共发现 ${totalPages} 页。`);

  if (totalPages <= 1) {
    console.log('\n抓取全站数据完毕!');
    return allData;
  }

  // Search-form values are taken once, from the first page, and re-sent with
  // every postback.
  const formValues = collectFormValues($);

  for (let pageNumber = 2; pageNumber <= totalPages; pageNumber++) {
    console.log(`正在抓取第 ${pageNumber} 页...`);

    // `$` always holds the latest page, so the state fields posted here are
    // the ones the server issued with the previous response.
    const postData = buildNextPageRequest($, formValues, pageNumber);
    if (postData === null) {
      console.log('无法找到 __VIEWSTATE,终止抓取。');
      break;
    }

    response = await httpClient.post(startUrl, postData, {
      headers: {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': startUrl,
      },
    });

    $ = loadHtml(response.data);
    const nextPageData = await extractDataFromHtml($, origin);
    console.log(`第 ${pageNumber} 页抓取完成,获得 ${nextPageData.length} 条数据。`);

    if (nextPageData.length === 0) {
      console.log(`第 ${pageNumber} 页没有数据,抓取结束。`);
      break;
    }
    allData.push(...nextPageData);
  }

  console.log(`\n抓取全站数据完毕!共 ${allData.length} 条原始记录。`);
  return allData;
}
|
||||
|
||||
/**
 * Renumbers the "序号" field of every record to its 1-based position in the
 * array, stored as a string.
 *
 * Records are mutated in place. A missing or empty input yields a fresh empty
 * array (and no log line); otherwise the input array itself is returned.
 *
 * @param {Array<Object>} dataArray - The array of data to re-index.
 * @returns {Array<Object>} - The re-indexed data array.
 */
export function reIndex(dataArray) {
  const hasRecords = Boolean(dataArray) && dataArray.length !== 0;
  if (!hasRecords) {
    return [];
  }

  let position = 0;
  while (position < dataArray.length) {
    const record = dataArray[position];
    position += 1;
    record['序号'] = String(position);
  }
  console.log('序号字段已重新编号。');
  return dataArray;
}
|
||||
Loading…
Reference in New Issue
Block a user