scraperUtils

This commit is contained in:
秦秋旭 2026-01-22 15:44:57 +08:00
parent 8cdd473c48
commit f11b997db4
7 changed files with 162 additions and 324 deletions

View File

@ -174,8 +174,8 @@
"资质等级": "二级", "资质等级": "二级",
"核准预售套数": 236, "核准预售套数": 236,
"核准预售面积": "0", "核准预售面积": "0",
"已售总套数": 69, "已售总套数": 70,
"未售总套数": 167, "未售总套数": 166,
"已售总面积": "0", "已售总面积": "0",
"未售总面积": "0", "未售总面积": "0",
"楼盘销售部地址": "星河明珠湾营销中心", "楼盘销售部地址": "星河明珠湾营销中心",
@ -337,7 +337,7 @@
"楼幢": [ "楼幢": [
{ {
"楼幢名称": "C2幢", "楼幢名称": "C2幢",
"成交均价": "14476.55", "成交均价": "13967.14",
"bid": "1099728" "bid": "1099728"
}, },
{ {
@ -357,7 +357,7 @@
"批准时间": "2025-12-05", "批准时间": "2025-12-05",
"所在区域": "普宁市", "所在区域": "普宁市",
"总套数": 109, "总套数": 109,
"可售套数": 90, "可售套数": 89,
"许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?d0BkXtgEYDqVcRqLE4C4aQ%3D%3D" "许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?d0BkXtgEYDqVcRqLE4C4aQ%3D%3D"
} }
] ]
@ -895,8 +895,8 @@
"资质等级": "二级", "资质等级": "二级",
"核准预售套数": 1186, "核准预售套数": 1186,
"核准预售面积": "0", "核准预售面积": "0",
"已售总套数": 844, "已售总套数": 845,
"未售总套数": 342, "未售总套数": 341,
"已售总面积": "0", "已售总面积": "0",
"未售总面积": "0", "未售总面积": "0",
"楼盘销售部地址": "星河明珠湾花园", "楼盘销售部地址": "星河明珠湾花园",
@ -919,7 +919,7 @@
}, },
{ {
"楼幢名称": "B7幢", "楼幢名称": "B7幢",
"成交均价": "8468.21", "成交均价": "8464.22",
"bid": "1099699" "bid": "1099699"
} }
], ],
@ -930,7 +930,7 @@
"批准时间": "2026-01-21", "批准时间": "2026-01-21",
"所在区域": "普宁市", "所在区域": "普宁市",
"总套数": 114, "总套数": 114,
"可售套数": 56, "可售套数": 55,
"许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?atcXVrdhLqFDsVq8ylD0hw%3D%3D" "许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?atcXVrdhLqFDsVq8ylD0hw%3D%3D"
}, },
{ {
@ -2123,8 +2123,8 @@
"资质等级": "二级", "资质等级": "二级",
"核准预售套数": 2114, "核准预售套数": 2114,
"核准预售面积": "0", "核准预售面积": "0",
"已售总套数": 1817, "已售总套数": 1818,
"未售总套数": 297, "未售总套数": 296,
"已售总面积": "0", "已售总面积": "0",
"未售总面积": "0", "未售总面积": "0",
"楼盘销售部地址": "", "楼盘销售部地址": "",
@ -2450,7 +2450,7 @@
}, },
{ {
"楼幢名称": "14幢", "楼幢名称": "14幢",
"成交均价": "6356.32", "成交均价": "6512.01",
"bid": "1099566" "bid": "1099566"
} }
], ],
@ -2461,7 +2461,7 @@
"批准时间": "2025-12-02", "批准时间": "2025-12-02",
"所在区域": "普宁市", "所在区域": "普宁市",
"总套数": 301, "总套数": 301,
"可售套数": 176, "可售套数": 175,
"许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?d0BkXtgEYDqVhs6gdwhuMw%3D%3D" "许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?d0BkXtgEYDqVhs6gdwhuMw%3D%3D"
} }
] ]
@ -4197,8 +4197,8 @@
"资质等级": "二级", "资质等级": "二级",
"核准预售套数": 1118, "核准预售套数": 1118,
"核准预售面积": "0", "核准预售面积": "0",
"已售总套数": 992, "已售总套数": 994,
"未售总套数": 126, "未售总套数": 124,
"已售总面积": "0", "已售总面积": "0",
"未售总面积": "0", "未售总面积": "0",
"楼盘销售部地址": "普宁市北二环大道与铁山兰路交汇处", "楼盘销售部地址": "普宁市北二环大道与铁山兰路交汇处",
@ -4302,7 +4302,7 @@
"楼幢": [ "楼幢": [
{ {
"楼幢名称": "1幢", "楼幢名称": "1幢",
"成交均价": "7266.38", "成交均价": "7270.26",
"bid": "1099705" "bid": "1099705"
} }
], ],
@ -4313,7 +4313,7 @@
"批准时间": "2025-10-24", "批准时间": "2025-10-24",
"所在区域": "普宁市", "所在区域": "普宁市",
"总套数": 160, "总套数": 160,
"可售套数": 101, "可售套数": 99,
"许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?d0BkXtgEYDq%2FwD%2FQObW7NQ%3D%3D" "许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?d0BkXtgEYDq%2FwD%2FQObW7NQ%3D%3D"
}, },
{ {
@ -10812,8 +10812,8 @@
"资质等级": "暂定资质", "资质等级": "暂定资质",
"核准预售套数": 398, "核准预售套数": 398,
"核准预售面积": "0", "核准预售面积": "0",
"已售总套数": 407, "已售总套数": 405,
"未售总套数": -9, "未售总套数": -7,
"已售总面积": "0", "已售总面积": "0",
"未售总面积": "0", "未售总面积": "0",
"楼盘销售部地址": "", "楼盘销售部地址": "",
@ -10826,7 +10826,7 @@
"楼幢": [ "楼幢": [
{ {
"楼幢名称": "1号楼", "楼幢名称": "1号楼",
"成交均价": "6168.59", "成交均价": "6176.77",
"bid": "-664" "bid": "-664"
}, },
{ {
@ -10847,7 +10847,7 @@
"批准时间": "2020-08-20", "批准时间": "2020-08-20",
"所在区域": "普宁市", "所在区域": "普宁市",
"总套数": 398, "总套数": 398,
"可售套数": -9, "可售套数": -7,
"许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?%2FJnZZ%2FURSYFPzGDwdoVynw%3D%3D" "许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?%2FJnZZ%2FURSYFPzGDwdoVynw%3D%3D"
} }
] ]

View File

@ -32,7 +32,7 @@
"批准时间": "2026-01-21", "批准时间": "2026-01-21",
"所在区域": "普宁市", "所在区域": "普宁市",
"总套数": 114, "总套数": 114,
"可售套数": 56, "可售套数": 55,
"许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?atcXVrdhLqFDsVq8ylD0hw%3D%3D" "许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?atcXVrdhLqFDsVq8ylD0hw%3D%3D"
}, },
{ {
@ -152,7 +152,7 @@
"批准时间": "2025-12-05", "批准时间": "2025-12-05",
"所在区域": "普宁市", "所在区域": "普宁市",
"总套数": 109, "总套数": 109,
"可售套数": 90, "可售套数": 89,
"许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?d0BkXtgEYDqVcRqLE4C4aQ%3D%3D" "许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?d0BkXtgEYDqVcRqLE4C4aQ%3D%3D"
}, },
{ {
@ -212,7 +212,7 @@
"批准时间": "2025-12-02", "批准时间": "2025-12-02",
"所在区域": "普宁市", "所在区域": "普宁市",
"总套数": 301, "总套数": 301,
"可售套数": 176, "可售套数": 175,
"许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?d0BkXtgEYDqVhs6gdwhuMw%3D%3D" "许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?d0BkXtgEYDqVhs6gdwhuMw%3D%3D"
}, },
{ {
@ -284,7 +284,7 @@
"批准时间": "2025-10-24", "批准时间": "2025-10-24",
"所在区域": "普宁市", "所在区域": "普宁市",
"总套数": 160, "总套数": 160,
"可售套数": 101, "可售套数": 99,
"许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?d0BkXtgEYDq%2FwD%2FQObW7NQ%3D%3D" "许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?d0BkXtgEYDq%2FwD%2FQObW7NQ%3D%3D"
}, },
{ {
@ -1628,7 +1628,7 @@
"批准时间": "2020-08-20", "批准时间": "2020-08-20",
"所在区域": "普宁市", "所在区域": "普宁市",
"总套数": 398, "总套数": 398,
"可售套数": -9, "可售套数": -7,
"许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?%2FJnZZ%2FURSYFPzGDwdoVynw%3D%3D" "许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?%2FJnZZ%2FURSYFPzGDwdoVynw%3D%3D"
}, },
{ {

View File

@ -247,7 +247,7 @@
"楼幢": [ "楼幢": [
{ {
"楼幢名称": "C2幢", "楼幢名称": "C2幢",
"成交均价": "14476.55", "成交均价": "13967.14",
"bid": "1099728" "bid": "1099728"
}, },
{ {
@ -730,7 +730,7 @@
}, },
{ {
"楼幢名称": "B7幢", "楼幢名称": "B7幢",
"成交均价": "8468.21", "成交均价": "8464.22",
"bid": "1099699" "bid": "1099699"
} }
] ]
@ -1892,7 +1892,7 @@
}, },
{ {
"楼幢名称": "14幢", "楼幢名称": "14幢",
"成交均价": "6356.32", "成交均价": "6512.01",
"bid": "1099566" "bid": "1099566"
} }
] ]
@ -3294,7 +3294,7 @@
"楼幢": [ "楼幢": [
{ {
"楼幢名称": "1幢", "楼幢名称": "1幢",
"成交均价": "7266.38", "成交均价": "7270.26",
"bid": "1099705" "bid": "1099705"
} }
] ]
@ -8837,7 +8837,7 @@
"楼幢": [ "楼幢": [
{ {
"楼幢名称": "1号楼", "楼幢名称": "1号楼",
"成交均价": "6168.59", "成交均价": "6176.77",
"bid": "-664" "bid": "-664"
}, },
{ {

View File

@ -2,11 +2,12 @@ import axios from './axios.js';
import * as cheerio from 'cheerio'; import * as cheerio from 'cheerio';
import fs from 'fs/promises'; import fs from 'fs/promises';
import path from 'path'; import path from 'path';
import { fileURLToPath } from 'url'; // Add fileURLToPath import import { fileURLToPath } from 'url';
import { getEncryptedUrl } from './getEncryptedUrl.js'; // Import shared encryption function import { getEncryptedUrl } from './getEncryptedUrl.js';
import { scrapePaginatedData, reIndex } from './scraperUtils.js'; // Import new utils
const __filename = fileURLToPath(import.meta.url); // Define __filename const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename); // Define __dirname const __dirname = path.dirname(__filename);
const BASE_URL = 'http://120.236.48.169:89/HEMS/'; const BASE_URL = 'http://120.236.48.169:89/HEMS/';
const START_URL = BASE_URL + 'CompanyList.aspx'; const START_URL = BASE_URL + 'CompanyList.aspx';
@ -52,111 +53,13 @@ async function extractDataFromHtml($, origin) {
return data.filter(item => item !== null); return data.filter(item => item !== null);
} }
/**
* Scrapes all company data from the target website, handling pagination.
* @returns {Promise<Array<Object>>} - An array of all scraped data records.
*/
async function scrapeWebsite() {
let allData = [];
const origin = new URL(START_URL).origin;
console.log('开始抓取公司列表第一页数据...');
let response = await axios.get(START_URL);
let $ = cheerio.load(response.data);
let firstPageData = await extractDataFromHtml($, origin);
allData = allData.concat(firstPageData);
console.log(`第一页抓取完成,获得 ${firstPageData.length} 条数据。`);
// Get total pages for pagination
const totalRecordsSpan = $('#PageNavigator1_LblRecordCount');
const totalPagesSpan = $('#PageNavigator1_LblPageCount');
const totalRecords = totalRecordsSpan.length ? parseInt(totalRecordsSpan.text(), 10) : 0;
const totalPages = totalPagesSpan.length ? parseInt(totalPagesSpan.text(), 10) : 1;
console.log(`共发现 ${totalRecords} 条记录,分为 ${totalPages} 页。`);
// Collect form data for POST requests
const formValues = {};
$('input[name^="txt"], select').each((idx, el) => {
const name = $(el).attr('name');
if (name) {
formValues[name] = $(el).val() || '';
}
});
// Scrape remaining pages
for (let i = 2; i <= totalPages; i++) {
console.log(`正在抓取第 ${i} 页...`);
const viewState = $('#__VIEWSTATE').val();
const viewStateGenerator = $('#__VIEWSTATEGENERATOR').val();
const eventValidation = $('#__EVENTVALIDATION').val();
if (!viewState) {
console.log('无法找到 __VIEWSTATE终止抓取。');
break;
}
const postData = new URLSearchParams();
postData.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext');
postData.append('__EVENTARGUMENT', '');
postData.append('__VIEWSTATE', viewState);
postData.append('__VIEWSTATEGENERATOR', viewStateGenerator);
if (eventValidation) {
postData.append('__EVENTVALIDATION', eventValidation);
}
// Add form fields
for (const name in formValues) {
postData.append(name, formValues[name]);
}
postData.append('PageNavigator1$txtNewPageIndex', (i - 1).toString());
response = await axios.post(START_URL, postData, {
headers: {
'Content-Type': 'application/x-www-form-urlencoded',
'Referer': START_URL,
}
});
$ = cheerio.load(response.data);
const nextPageData = await extractDataFromHtml($, origin);
console.log(`${i} 页抓取完成,获得 ${nextPageData.length} 条数据。`);
if (nextPageData.length === 0) {
console.log(`${i} 页没有数据,抓取结束。`);
break;
}
allData = allData.concat(nextPageData);
}
console.log(`\n抓取全站公司数据完毕!共 ${allData.length} 条原始记录。`);
return allData;
}
/**
* Performs post-processing on the scraped data:
* 1. Re-indexes "序号" sequentially.
* @param {Array<Object>} allData - The raw scraped data.
* @returns {Array<Object>} - The processed data.
*/
function processScrapedData(allData) {
// Re-index "序号" sequentially
for (let i = 0; i < allData.length; i++) {
allData[i]['序号'] = (i + 1).toString();
}
console.log('序号字段已重新编号。');
return allData;
}
// Main function to orchestrate the scraping and saving process // Main function to orchestrate the scraping and saving process
export async function main() { export async function main() {
try { try {
console.log('🚀 开始抓取普宁房地产开发企业列表...'); console.log('🚀 开始抓取普宁房地产开发企业列表...');
const allData = await scrapeWebsite(); const allData = await scrapePaginatedData(START_URL, extractDataFromHtml); // Use generic pagination
const processedData = processScrapedData(allData); const processedData = reIndex(allData); // Use generic re-indexing
const dataPath = path.join(__dirname, '..', 'data', 'companies.json'); const dataPath = path.join(__dirname, '..', 'data', 'companies.json');
await fs.mkdir(path.dirname(dataPath), { recursive: true }); // Ensure data directory exists await fs.mkdir(path.dirname(dataPath), { recursive: true }); // Ensure data directory exists

View File

@ -3,6 +3,7 @@ import * as cheerio from 'cheerio';
import fs from 'fs/promises'; import fs from 'fs/promises';
import path from 'path'; import path from 'path';
import { getEncryptedUrl } from './getEncryptedUrl.js'; import { getEncryptedUrl } from './getEncryptedUrl.js';
import { scrapePaginatedData, reIndex } from './scraperUtils.js'; // Import new utils
const BASE_URL = 'http://120.236.48.169:89/HPMS/'; const BASE_URL = 'http://120.236.48.169:89/HPMS/';
const START_URL = BASE_URL + 'presellCertList.aspx'; const START_URL = BASE_URL + 'presellCertList.aspx';
@ -48,99 +49,21 @@ async function extractDataFromHtml($, origin) {
return data.filter(item => item !== null); return data.filter(item => item !== null);
} }
/**
* Scrapes all data from the target website, handling pagination.
* @returns {Promise<Array<Object>>} - An array of all scraped data records.
*/
async function scrapeWebsite() {
let allData = [];
const origin = new URL(BASE_URL).origin;
console.log('开始抓取第一页数据...');
let response = await axios.get(START_URL);
let $ = cheerio.load(response.data);
allData = await extractDataFromHtml($, origin);
console.log(`第一页抓取完成,获得 ${allData.length} 条数据。`);
const pageCountSpan = $('#PageNavigator1_LblPageCount');
const totalPages = pageCountSpan.length ? parseInt(pageCountSpan.text(), 10) : 1;
console.log(`共发现 ${totalPages} 页。`);
const formValues = {};
$('input[name^="txt"], select').each((idx, el) => {
const name = $(el).attr('name');
if (name) {
formValues[name] = $(el).val() || '';
}
});
for (let i = 2; i <= totalPages; i++) {
console.log(`正在抓取第 ${i} 页...`);
const viewState = $('#__VIEWSTATE').val();
const viewStateGenerator = $('#__VIEWSTATEGENERATOR').val();
const eventValidation = $('#__EVENTVALIDATION').val();
if (!viewState) {
console.log('无法找到 __VIEWSTATE终止抓取。');
break;
}
const postData = new URLSearchParams();
postData.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext');
postData.append('__EVENTARGUMENT', '');
postData.append('__VIEWSTATE', viewState);
postData.append('__VIEWSTATEGENERATOR', viewStateGenerator);
if(eventValidation) {
postData.append('__EVENTVALIDATION', eventValidation);
}
for (const name in formValues) {
postData.append(name, formValues[name]);
}
postData.append('PageNavigator1$txtNewPageIndex', (i - 1).toString());
response = await axios.post(START_URL, postData, {
headers: {
'Content-Type': 'application/x-www-form-urlencoded',
'Referer': START_URL,
}
});
$ = cheerio.load(response.data);
const nextPageData = await extractDataFromHtml($, origin);
console.log(`${i} 页抓取完成,获得 ${nextPageData.length} 条数据。`);
if (nextPageData.length === 0) {
console.log(`${i} 页没有数据,抓取结束。`);
break;
}
allData = allData.concat(nextPageData);
}
console.log(`\n抓取全站数据完毕!共 ${allData.length} 条原始记录。`);
return allData;
}
/** /**
* Performs post-processing on the scraped data: * Performs post-processing on the scraped data:
* 1. Filters out records where "许可证号" is "空". * 1. Filters out records where "许可证号" is "空".
* 2. Re-indexes "序号" sequentially. * 2. Re-indexes "序号" sequentially using the utility function.
* 3. Converts "总套数" and "可售套数" fields to numbers. * 3. Converts "总套数" and "可售套数" fields to numbers.
* @param {Array<Object>} allData - The raw scraped data. * @param {Array<Object>} allData - The raw scraped data.
* @returns {Promise<Array<Object>>} - The processed data. * @returns {Promise<Array<Object>>} - The processed data.
*/ */
async function processScrapedData(allData) { async function processScrapedData(allData) {
// 1. Filter out records where "许可证号" is "空" // 1. Filter out records where "许可证号" is "空" or null
let processedData = allData.filter(record => record['许可证号'] !== '空'); let processedData = allData.filter(record => record['许可证号'] !== '空' && record['许可证号']);
console.log(`删除 "许可证号" 为 "空" 的记录后,剩余 ${processedData.length} 条记录。`); console.log(`删除无效许可证记录后,剩余 ${processedData.length} 条记录。`);
// 2. Re-index "序号" sequentially // 2. Re-index "序号" sequentially
for (let i = 0; i < processedData.length; i++) { reIndex(processedData); // Use the shared utility function
processedData[i]['序号'] = (i + 1).toString();
}
console.log('序号字段已重新编号。');
// 3. Convert "总套数" and "可售套数" to numbers // 3. Convert "总套数" and "可售套数" to numbers
for (const record of processedData) { for (const record of processedData) {
@ -155,14 +78,15 @@ async function processScrapedData(allData) {
// 主函数 - 导出以便在根目录调用 // 主函数 - 导出以便在根目录调用
export async function main() { export async function main() {
try { try {
const allData = await scrapeWebsite(); console.log('🚀 开始抓取预售许可证数据...');
const allData = await scrapePaginatedData(START_URL, extractDataFromHtml); // Use generic pagination
const processedData = await processScrapedData(allData); const processedData = await processScrapedData(allData);
const dataPath = path.join(path.dirname(new URL(import.meta.url).pathname), '..', 'data', 'preSaleLicense.json'); const dataPath = path.join(path.dirname(new URL(import.meta.url).pathname), '..', 'data', 'preSaleLicense.json');
await fs.mkdir(path.dirname(dataPath), { recursive: true }); await fs.mkdir(path.dirname(dataPath), { recursive: true });
await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8'); await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8');
console.log(`更新后的数据已保存至 data/preSaleLicense.json 文件。`); console.log(`更新后的数据已保存至 data/preSaleLicense.json 文件。`);
console.log('\n所有数据处理和文件生成任务已完成。'); console.log('\n所有数据处理和文件生成任务已完成。');

View File

@ -5,6 +5,7 @@ import path from 'path';
import { fileURLToPath } from 'url'; import { fileURLToPath } from 'url';
import { getEncryptedUrl } from './getEncryptedUrl.js'; import { getEncryptedUrl } from './getEncryptedUrl.js';
import { scrapePaginatedData, reIndex } from './scraperUtils.js'; // Import new utils
const __filename = fileURLToPath(import.meta.url); const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename); const __dirname = path.dirname(__filename);
@ -122,113 +123,20 @@ async function scrapeProjectDetails(project) {
} }
/** // The old scrapeWebsite function is removed.
* 抓取网站所有数据处理分页
* @returns {Promise<Array<Object>>} - 所有抓取的数据记录数组
*/
async function scrapeWebsite() {
let allData = [];
console.log('开始抓取第一页数据...');
const origin = new URL(START_URL).origin;
let response = await axios.get(START_URL);
let $ = cheerio.load(response.data);
let firstPageData = await extractDataFromHtml($, origin);
allData = allData.concat(firstPageData);
console.log(`第一页抓取完成,获得 ${allData.length} 条数据。`);
// 获取总页数
const pageCountSpan = $('#PageNavigator1_LblPageCount');
const totalPages = pageCountSpan.length ? parseInt(pageCountSpan.text(), 10) : 1;
console.log(`共发现 ${totalPages} 页。`);
// 收集表单数据用于POST请求
const formValues = {};
$('input[name^="txt"], select').each((idx, el) => {
const name = $(el).attr('name');
if (name) {
formValues[name] = $(el).val() || '';
}
});
// 抓取其余页面
for (let i = 2; i <= totalPages; i++) {
console.log(`正在抓取第 ${i} 页...`);
const viewState = $('#__VIEWSTATE').val();
const viewStateGenerator = $('#__VIEWSTATEGENERATOR').val();
const eventValidation = $('#__EVENTVALIDATION').val();
if (!viewState) {
console.log('无法找到 __VIEWSTATE终止抓取。');
break;
}
const postData = new URLSearchParams();
postData.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext');
postData.append('__EVENTARGUMENT', '');
postData.append('__VIEWSTATE', viewState);
postData.append('__VIEWSTATEGENERATOR', viewStateGenerator);
if (eventValidation) {
postData.append('__EVENTVALIDATION', eventValidation);
}
// 添加表单字段
for (const name in formValues) {
postData.append(name, formValues[name]);
}
postData.append('PageNavigator1$txtNewPageIndex', (i - 1).toString());
response = await axios.post(START_URL, postData, {
headers: {
'Content-Type': 'application/x-www-form-urlencoded',
'Referer': START_URL,
}
});
$ = cheerio.load(response.data);
const nextPageData = await extractDataFromHtml($, origin);
console.log(`${i} 页抓取完成,获得 ${nextPageData.length} 条数据。`);
if (nextPageData.length === 0) {
console.log(`${i} 页没有数据,抓取结束。`);
break;
}
allData = allData.concat(nextPageData);
}
console.log(`\n抓取全站数据完毕!共 ${allData.length} 条原始记录。`);
console.log('\n开始抓取项目详情...');
const detailedDataPromises = allData.map(project => scrapeProjectDetails(project));
const detailedData = await Promise.all(detailedDataPromises);
console.log('所有项目详情抓取完毕。');
return detailedData;
}
/**
* 对抓取的数据进行后处理
* 1. 重新编号序号字段
* @param {Array<Object>} allData - 原始抓取数据
* @returns {Promise<Array<Object>>} - 处理后的数据
*/
async function processScrapedData(allData) {
// 重新编号序号字段
for (let i = 0; i < allData.length; i++) {
allData[i]['序号'] = (i + 1).toString();
}
console.log('序号字段已重新编号。');
return allData;
}
// 主函数 - 导出以便在根目录调用 // 主函数 - 导出以便在根目录调用
export async function main() { export async function main() {
try { try {
const allData = await scrapeWebsite(); console.log('🚀 开始抓取项目数据...');
const processedData = await processScrapedData(allData); const projectList = await scrapePaginatedData(START_URL, extractDataFromHtml);
console.log('\n开始抓取项目详情...');
const detailedDataPromises = projectList.map(project => scrapeProjectDetails(project));
const allData = await Promise.all(detailedDataPromises);
console.log('所有项目详情抓取完毕。');
const processedData = reIndex(allData); // Use generic re-indexing
// 保存为JSON文件 // 保存为JSON文件
const dataPath = path.join(__dirname, '..', 'data', 'project.json'); const dataPath = path.join(__dirname, '..', 'data', 'project.json');

103
scripts/scraperUtils.js Normal file
View File

@ -0,0 +1,103 @@
import axios from './axios.js';
import * as cheerio from 'cheerio';
/**
* A generic function to scrape paginated data from the target ASP.NET website.
* @param {string} startUrl - The initial URL to begin scraping.
* @param {Function} extractDataFromHtml - The page-specific function to extract data from Cheerio object.
* @returns {Promise<Array<Object>>} - An array of all scraped data records.
*/
/**
 * Generic pagination scraper for the target ASP.NET WebForms site.
 * Loads the first page, reads the pager's total page count, then walks the
 * remaining pages by replaying the WebForms postback (__VIEWSTATE, __EVENTTARGET,
 * and the original search-form fields) against the same URL.
 * @param {string} startUrl - The initial URL to begin scraping.
 * @param {Function} extractDataFromHtml - Page-specific extractor: ($, origin) => Promise<Array<Object>>.
 * @returns {Promise<Array<Object>>} - An array of all scraped data records.
 */
export async function scrapePaginatedData(startUrl, extractDataFromHtml) {
    const origin = new URL(startUrl).origin;
    console.log(`开始抓取第一页数据: ${startUrl}`);
    let response = await axios.get(startUrl);
    let $ = cheerio.load(response.data);
    const firstPageData = await extractDataFromHtml($, origin);
    let allData = [].concat(firstPageData);
    console.log(`第一页抓取完成,获得 ${firstPageData.length} 条数据。`);
    // Read the total page count from the pager label; default to a single page.
    const pageCountNode = $('#PageNavigator1_LblPageCount');
    const totalPages = pageCountNode.length ? parseInt(pageCountNode.text(), 10) : 1;
    console.log(`共发现 ${totalPages} 页。`);
    if (totalPages <= 1) {
        console.log('\n抓取全站数据完毕!');
        return allData;
    }
    // Snapshot the search-form inputs so every postback reproduces the same filter state.
    const formValues = {};
    $('input[name^="txt"], select').each((idx, el) => {
        const fieldName = $(el).attr('name');
        if (fieldName) {
            formValues[fieldName] = $(el).val() || '';
        }
    });
    for (let pageNo = 2; pageNo <= totalPages; pageNo++) {
        console.log(`正在抓取第 ${pageNo} 页...`);
        // Hidden WebForms state must be re-read from the most recent response.
        const viewState = $('#__VIEWSTATE').val();
        const viewStateGenerator = $('#__VIEWSTATEGENERATOR').val();
        const eventValidation = $('#__EVENTVALIDATION').val();
        if (!viewState) {
            console.log('无法找到 __VIEWSTATE终止抓取。');
            break;
        }
        const payload = new URLSearchParams();
        payload.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext');
        payload.append('__EVENTARGUMENT', '');
        payload.append('__VIEWSTATE', viewState);
        payload.append('__VIEWSTATEGENERATOR', viewStateGenerator);
        if (eventValidation) {
            payload.append('__EVENTVALIDATION', eventValidation);
        }
        Object.entries(formValues).forEach(([fieldName, fieldValue]) => {
            payload.append(fieldName, fieldValue);
        });
        payload.append('PageNavigator1$txtNewPageIndex', String(pageNo - 1));
        response = await axios.post(startUrl, payload, {
            headers: {
                'Content-Type': 'application/x-www-form-urlencoded',
                'Referer': startUrl,
            }
        });
        $ = cheerio.load(response.data);
        const nextPageData = await extractDataFromHtml($, origin);
        console.log(`第 ${pageNo} 页抓取完成,获得 ${nextPageData.length} 条数据。`);
        if (nextPageData.length === 0) {
            console.log(`第 ${pageNo} 页没有数据,抓取结束。`);
            break;
        }
        allData = allData.concat(nextPageData);
    }
    console.log(`\n抓取全站数据完毕!共 ${allData.length} 条原始记录。`);
    return allData;
}
/**
* Re-indexes the "序号" field of each record in an array sequentially, starting from 1.
* @param {Array<Object>} dataArray - The array of data to re-index.
* @returns {Array<Object>} - The re-indexed data array.
*/
/**
 * Re-indexes the "序号" field of each record in an array sequentially, starting from 1.
 * Mutates the records in place and returns the same array; a nullish or empty
 * input yields a fresh empty array.
 * @param {Array<Object>} dataArray - The array of data to re-index.
 * @returns {Array<Object>} - The re-indexed data array.
 */
export function reIndex(dataArray) {
    if (!dataArray || dataArray.length === 0) return [];
    dataArray.forEach((record, position) => {
        record['序号'] = String(position + 1);
    });
    console.log('序号字段已重新编号。');
    return dataArray;
}