import axios from 'axios';
import * as cheerio from 'cheerio';
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';
// ES modules have no built-in __filename / __dirname, so derive both from
// this module's own URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Root of the target site (trailing slash included) and the listing page
// located directly under it.
const BASE_URL = 'http://120.236.48.169:89/HPMS/';
const START_URL = `${BASE_URL}ProjectInfoList.aspx`;
// XML Encode
function xmlEncode(text) {
return text.replace(/&/g, '&').replace(/
- ${xmlEncode(queryString)}
`;
const encryptionUrl = new URL('/Common/Agents/ExeFunCommon.aspx', origin).href;
try {
const response = await axios.post(encryptionUrl, xmlPayload, {
headers: {
'Content-Type': 'application/xml',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
}
});
const encryptedQuery = response.data;
if (encryptedQuery) {
return `${new URL(path, origin).href}?${encodeURIComponent(encryptedQuery)}`;
}
} catch (error) {
console.error(`加密链接失败: ${relativeUrl}`, error.message);
}
// Fallback to original url on error
return new URL(relativeUrl, origin).href;
}
/**
 * Extracts the rows of the result-list table from a loaded page.
 *
 * Every `tr` inside `.resultlist table` that contains at least one `td` is
 * inspected; rows with fewer than four cells are dropped. When the project
 * name cell holds a link whose `onclick` has the form `open_click('...')`,
 * the quoted argument is resolved through `getEncryptedUrl`.
 *
 * @param {Function} $ - Cheerio instance loaded with the page HTML.
 * @param {string} origin - Site origin passed through to `getEncryptedUrl`.
 * @returns {Promise<Array<Object>>} One record per usable table row, in
 *   document order.
 */
async function extractDataFromHtml($, origin) {
  // Trimmed text content of a single table cell.
  const cellText = (cell) => $(cell).text().trim();

  // Converts one <tr> into a record, or null when the row is too short.
  const toRecord = async (tr) => {
    const cells = $(tr).find('td');
    if (cells.length < 4) {
      return null;
    }

    const nameCell = $(cells[1]);
    const anchor = nameCell.find('a');
    const hasAnchor = anchor.length > 0;

    const record = {
      '序号': cellText(cells[0]),
      '项目名称': (hasAnchor ? anchor : nameCell).text().trim(),
      '开发企业': cellText(cells[2]),
      '项目地址': cellText(cells[3]),
      '项目链接': '',
    };

    if (!hasAnchor) {
      return record;
    }
    const onclick = anchor.attr('onclick');
    if (!onclick) {
      return record;
    }

    // The detail page address is only present as the open_click(...) argument.
    const found = onclick.match(/open_click\(['"]([^'"]+)['"]\)/);
    if (found && found[1]) {
      record['项目链接'] = await getEncryptedUrl(found[1], origin);
    }
    return record;
  };

  // All rows are processed concurrently; Promise.all keeps document order.
  const tableRows = $('.resultlist table tr:has(td)').get();
  const records = await Promise.all(tableRows.map((tr) => toRecord(tr)));
  return records.filter((record) => record !== null);
}
/**
 * Scrapes every page of the project listing, following its pager.
 *
 * The first page is fetched with GET. Each later page is requested with a
 * form POST that targets the pager's "next" link and carries the hidden
 * state fields (`__VIEWSTATE` etc.) of the most recently loaded page.
 * Paging stops early when `__VIEWSTATE` is missing or a page yields no rows.
 *
 * @returns {Promise<Array<Object>>} All records extracted from every page.
 * @throws Rejects with the underlying request error if any page request
 *   fails; nothing is caught here.
 */
async function scrapeWebsite() {
  const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';
  const allData = [];

  console.log('开始抓取第一页数据...');
  const origin = new URL(START_URL).origin;
  let response = await axios.get(START_URL, {
    headers: {
      'User-Agent': userAgent,
    },
  });
  let $ = cheerio.load(response.data);
  const firstPageData = await extractDataFromHtml($, origin);
  allData.push(...firstPageData);
  console.log(`第一页抓取完成,获得 ${allData.length} 条数据。`);

  // Read the total page count; a missing or non-numeric label means a
  // single page rather than NaN.
  const pageCountSpan = $('#PageNavigator1_LblPageCount');
  const parsedPageCount = Number.parseInt(pageCountSpan.text(), 10);
  const totalPages = Number.isNaN(parsedPageCount) ? 1 : parsedPageCount;
  console.log(`共发现 ${totalPages} 页。`);

  // Snapshot the search form fields of the first page; they are replayed
  // unchanged on every subsequent POST.
  const formValues = {};
  $('input[name^="txt"], select').each((idx, el) => {
    const name = $(el).attr('name');
    if (name) {
      formValues[name] = $(el).val() || '';
    }
  });

  // Fetch the remaining pages one at a time; each request depends on the
  // hidden state fields returned by the previous response.
  for (let i = 2; i <= totalPages; i++) {
    console.log(`正在抓取第 ${i} 页...`);
    const viewState = $('#__VIEWSTATE').val();
    const viewStateGenerator = $('#__VIEWSTATEGENERATOR').val();
    const eventValidation = $('#__EVENTVALIDATION').val();
    if (!viewState) {
      console.log('无法找到 __VIEWSTATE,终止抓取。');
      break;
    }

    const postData = new URLSearchParams();
    postData.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext');
    postData.append('__EVENTARGUMENT', '');
    postData.append('__VIEWSTATE', viewState);
    // URLSearchParams stringifies its values, so an absent field would be
    // posted as the literal text "undefined"; send optional fields only
    // when the page actually provides them.
    if (viewStateGenerator) {
      postData.append('__VIEWSTATEGENERATOR', viewStateGenerator);
    }
    if (eventValidation) {
      postData.append('__EVENTVALIDATION', eventValidation);
    }
    for (const [name, value] of Object.entries(formValues)) {
      postData.append(name, value);
    }
    // NOTE(review): sends i - 1, i.e. the number of the page being left;
    // presumably the pager text box holds the current page — confirm
    // against the live form.
    postData.append('PageNavigator1$txtNewPageIndex', (i - 1).toString());

    response = await axios.post(START_URL, postData, {
      headers: {
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': userAgent,
        'Referer': START_URL,
      },
    });
    $ = cheerio.load(response.data);
    const nextPageData = await extractDataFromHtml($, origin);
    console.log(`第 ${i} 页抓取完成,获得 ${nextPageData.length} 条数据。`);
    if (nextPageData.length === 0) {
      console.log(`第 ${i} 页没有数据,抓取结束。`);
      break;
    }
    allData.push(...nextPageData);
  }

  console.log(`\n抓取全站数据完毕!共 ${allData.length} 条原始记录。`);
  return allData;
}
/**
* 对抓取的数据进行后处理:
* 1. 重新编号序号字段
* @param {Array