/*
 * puning-real-estate/Project/index.js
 *
 * 167 lines, 5.8 KiB, JavaScript
 *
 * Note carried over from the repository web viewer: this file contains
 * Unicode characters that might be confused with other characters. If that
 * is intentional, the warning can be safely ignored.
 */


import axios from 'axios';
import * as cheerio from 'cheerio';
import fs from 'fs/promises';
const BASE_URL = 'http://120.236.48.169:89/HPMS/';
const START_URL = BASE_URL + 'ProjectInfoList.aspx';
// 从HTML中提取表格数据的函数
function extractDataFromHtml($) {
const data = [];
// 查找包含数据的表格行,跳过表头
const rows = $('.resultlist table tr:has(td)');
rows.each((i, row) => {
const columns = $(row).find('td');
// 项目信息表格有4列序号、项目名称、开发企业、项目地址
if (columns.length >= 4) {
const projectNameCell = $(columns[1]); // 项目名称在第2列
const projectNameLink = projectNameCell.find('a');
const rowData = {
'序号': $(columns[0]).text().trim(),
'项目名称': projectNameLink.length ? projectNameLink.text().trim() : projectNameCell.text().trim(),
'开发企业': $(columns[2]).text().trim(),
'项目地址': $(columns[3]).text().trim(),
'项目链接': '', // 初始化项目链接字段
};
// 如果项目名称有链接,提取链接地址
if (projectNameLink.length) {
const onclickAttr = projectNameLink.attr('onclick');
if (onclickAttr) {
// 从onclick属性中提取链接
const match = onclickAttr.match(/open_click\(['"]([^'"]+)['"]\)/);
if (match && match[1]) {
const origin = new URL(BASE_URL).origin;
rowData['项目链接'] = new URL(match[1], origin).href;
}
}
}
data.push(rowData);
}
});
return data;
}
/**
 * Scrape every page of the project list and return the combined records.
 *
 * The first page is fetched with a GET. Each following page is requested by
 * POSTing the hidden postback fields (__VIEWSTATE and friends) read from the
 * most recently loaded page, with 'PageNavigator1$LnkBtnNext' as the event
 * target (presumably the pager's "next page" link; confirm against the site).
 * Scraping stops early when a page has no __VIEWSTATE or yields no rows.
 *
 * @returns {Promise<Array<Object>>} All scraped records, in page order.
 * @throws Propagates any error raised by axios for a failed request.
 */
async function scrapeWebsite() {
  // The same browser-like User-Agent is sent with every request.
  const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';

  console.log('开始抓取第一页数据...');
  let response = await axios.get(START_URL, {
    headers: {
      'User-Agent': userAgent,
    },
  });
  let $ = cheerio.load(response.data);
  const allData = extractDataFromHtml($);
  console.log(`第一页抓取完成,获得 ${allData.length} 条数据。`);

  // Read the total page count; a missing or non-numeric label means a single page.
  const parsedPageCount = Number.parseInt($('#PageNavigator1_LblPageCount').text(), 10);
  const totalPages = Number.isInteger(parsedPageCount) && parsedPageCount > 0 ? parsedPageCount : 1;
  console.log(`共发现 ${totalPages} 页。`);

  // Collect the form fields of the first page; they are re-sent with every POST.
  const formValues = {};
  $('input[name^="txt"], select').each((idx, el) => {
    const name = $(el).attr('name');
    if (name) {
      formValues[name] = $(el).val() || '';
    }
  });

  // Fetch the remaining pages one after another; each request depends on the
  // hidden fields of the page loaded just before it.
  for (let i = 2; i <= totalPages; i++) {
    console.log(`正在抓取第 ${i} 页...`);
    const viewState = $('#__VIEWSTATE').val();
    const viewStateGenerator = $('#__VIEWSTATEGENERATOR').val();
    const eventValidation = $('#__EVENTVALIDATION').val();
    if (!viewState) {
      console.log('无法找到 __VIEWSTATE终止抓取。');
      break;
    }

    const postData = new URLSearchParams();
    postData.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext');
    postData.append('__EVENTARGUMENT', '');
    postData.append('__VIEWSTATE', viewState);
    // URLSearchParams stringifies its values, so an absent field would be
    // posted as the literal text "undefined"; only send it when present.
    if (viewStateGenerator !== undefined) {
      postData.append('__VIEWSTATEGENERATOR', viewStateGenerator);
    }
    if (eventValidation) {
      postData.append('__EVENTVALIDATION', eventValidation);
    }
    // Add the form fields captured from the first page.
    for (const [name, value] of Object.entries(formValues)) {
      postData.append(name, value);
    }
    // The index of the page currently loaded (the one before the page requested).
    postData.append('PageNavigator1$txtNewPageIndex', String(i - 1));

    response = await axios.post(START_URL, postData, {
      headers: {
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': userAgent,
        'Referer': START_URL,
      },
    });
    $ = cheerio.load(response.data);
    const nextPageData = extractDataFromHtml($);
    console.log(`${i} 页抓取完成,获得 ${nextPageData.length} 条数据。`);
    if (nextPageData.length === 0) {
      console.log(`${i} 页没有数据,抓取结束。`);
      break;
    }
    allData.push(...nextPageData);
  }

  console.log(`\n抓取全站数据完毕!共 ${allData.length} 条原始记录。`);
  return allData;
}
/**
 * Post-process the scraped records by renumbering them.
 *
 * Every record's '序号' field is overwritten with its 1-based position in the
 * array, stored as a string. The array and its records are modified in place.
 *
 * @param {Array<Object>} allData - Raw scraped records.
 * @returns {Promise<Array<Object>>} The same array, after renumbering.
 */
async function processScrapedData(allData) {
  // Renumber sequentially so the index is continuous across pages.
  for (const [position, record] of allData.entries()) {
    record['序号'] = String(position + 1);
  }
  console.log('序号字段已重新编号。');
  return allData;
}
/**
 * Entry point: scrape the site, post-process the records and write them to
 * ./data.json. Any failure is reported on stderr instead of being rethrown.
 */
async function main() {
  try {
    const scrapedRecords = await scrapeWebsite();
    const finalRecords = await processScrapedData(scrapedRecords);

    // Save as pretty-printed JSON (4-space indentation).
    const serialized = JSON.stringify(finalRecords, null, 4);
    await fs.writeFile('./data.json', serialized, 'utf-8');

    console.log('项目数据已保存至 data.json 文件。');
    console.log('\n所有数据抓取和处理任务已完成。');
  } catch (error) {
    console.error('抓取或处理过程中发生错误:', error.message);
    // Errors that carry a response object also get their HTTP status logged.
    if (error.response) {
      console.error('Status:', error.response.status);
    }
  }
}

main();