puning-real-estate/scripts/getProject.js
2026-01-22 10:24:47 +08:00

284 lines
9.9 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import axios from './axios.js';
import * as cheerio from 'cheerio';
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';
// ES modules do not provide __filename/__dirname, so both are rebuilt from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Base address of the scraped site; START_URL is the project-list page directly under it.
const BASE_URL = 'http://120.236.48.169:89/HPMS/';
const START_URL = new URL('ProjectInfoList.aspx', BASE_URL).href;
/**
 * Escape text for use as XML element content.
 *
 * Only `&` and `<` are rewritten; `>` and quote characters pass through
 * unchanged. Each character is visited once, so the `&` of a generated
 * entity is never escaped a second time.
 *
 * @param {string} text - Raw text to embed in an XML element.
 * @returns {string} Text with `&` replaced by `&amp;` and `<` by `&lt;`.
 */
function xmlEncode(text) {
    const entities = { '&': '&amp;', '<': '&lt;' };
    return text.replace(/[&<]/g, (character) => entities[character]);
}
/**
 * Resolve a link found in a page to an absolute URL, replacing its query
 * string with the encrypted form returned by the site's encryption endpoint.
 *
 * A link without a query string is only resolved against `origin`. When the
 * encryption request fails, or answers with an empty body, the link is
 * resolved with its original (unencrypted) query string instead.
 *
 * @param {string} relativeUrl - Link as found in the page; may be empty.
 * @param {string} origin - Base used to resolve both the link and the endpoint.
 * @returns {Promise<string>} Absolute URL, or '' when `relativeUrl` is empty.
 */
async function getEncryptedUrl(relativeUrl, origin) {
    if (!relativeUrl) {
        return '';
    }
    // Split on the FIRST '?' only, so a query value that itself contains '?'
    // is sent for encryption in full instead of being cut at the second '?'.
    const queryStart = relativeUrl.indexOf('?');
    if (queryStart === -1) {
        return new URL(relativeUrl, origin).href;
    }
    // Called pagePath rather than `path` so the imported `path` module is not shadowed.
    const pagePath = relativeUrl.slice(0, queryStart);
    const queryString = relativeUrl.slice(queryStart + 1);
    const xmlPayload = `<?xml version="1.0" encoding="utf-8" standalone="yes"?> <param funname="SouthDigital.BasicFun.MyEncrypt.DefaultEncrypt"> <item>${xmlEncode(queryString)}</item> </param>`;
    const encryptionUrl = new URL('/Common/Agents/ExeFunCommon.aspx', origin).href;
    try {
        const response = await axios.post(encryptionUrl, xmlPayload, {
            headers: {
                'Content-Type': 'application/xml',
            },
        });
        const encryptedQuery = response.data;
        if (encryptedQuery) {
            // The response body is used as the complete query string of the final URL.
            return `${new URL(pagePath, origin).href}?${encodeURIComponent(encryptedQuery)}`;
        }
    } catch (error) {
        console.error(`加密链接失败: ${relativeUrl}`, error.message);
    }
    // Best effort: fall back to the unencrypted link on error or empty response.
    return new URL(relativeUrl, origin).href;
}
/**
 * Turn the rows of the result-list table into project records.
 *
 * Every `tr` under `.resultlist table` that contains a `td` is considered;
 * rows with fewer than four cells are dropped. When the project-name cell
 * holds a link whose `onclick` is an `open_click('...')` call, the quoted
 * target is passed through `getEncryptedUrl` and stored as the project link.
 * All rows are processed concurrently and the output keeps the row order.
 *
 * @param {Function} $ - cheerio instance loaded with the list page.
 * @param {string} origin - Base used to resolve project links.
 * @returns {Promise<Array<Object>>} One record per usable table row.
 */
async function extractDataFromHtml($, origin) {
    const openClickPattern = /open_click\(['"]([^'"]+)['"]\)/;

    // Build the record for one table row; resolves to null for rows that are too short.
    const toRecord = async (tableRow) => {
        const cells = $(tableRow).find('td');
        if (cells.length < 4) {
            return null;
        }
        const cellText = (index) => $(cells[index]).text().trim();
        const nameCell = $(cells[1]);
        const anchor = nameCell.find('a');
        const hasAnchor = anchor.length > 0;

        const record = {
            '序号': cellText(0),
            // Prefer the link text; fall back to the whole cell when there is no link.
            '项目名称': (hasAnchor ? anchor : nameCell).text().trim(),
            '开发企业': cellText(2),
            '项目地址': cellText(3),
            '项目链接': '',
        };

        const onclick = hasAnchor ? anchor.attr('onclick') : undefined;
        const found = onclick ? onclick.match(openClickPattern) : null;
        if (found && found[1]) {
            record['项目链接'] = await getEncryptedUrl(found[1], origin);
        }
        return record;
    };

    const tableRows = $('.resultlist table tr:has(td)').get();
    const records = await Promise.all(tableRows.map((tableRow) => toRecord(tableRow)));
    return records.filter((record) => record !== null);
}
// Normalise a detail-table label: trim it, then drop the first fullwidth and the first ASCII colon.
function cleanDetailLabel(label) {
    // '\uFF1A' is the fullwidth colon. It is written as an escape so it cannot be
    // lost or confused with ':'; an empty-string pattern here would be a no-op.
    return label.trim().replace('\uFF1A', '').replace(':', '');
}

// Copy every label/value pair of the `.housedetail` table onto `project`.
function readDetailFields($, project) {
    $('.housedetail tr').each((i, row) => {
        const cells = $(row).find('td');
        // A row holds a pair in cells 0/1 and, when it has four or more cells, another in cells 2/3.
        for (const labelIndex of [0, 2]) {
            if (cells.length < labelIndex + 2) {
                continue;
            }
            const key = cleanDetailLabel($(cells[labelIndex]).text());
            const value = $(cells[labelIndex + 1]).text().trim();
            if (key) {
                project[key] = value;
            }
        }
    });
}

// Build the pre-sale licence list from the `#presellInfo` field and the `.three` blocks of the page.
function readPresellLicenses($) {
    const licenses = [];
    const presellInfo = $('#presellInfo').val();
    if (!presellInfo) {
        return licenses;
    }
    // The field is split into items on ';;' and each item into two parts on ',,';
    // the second part is used as the licence number.
    for (const item of presellInfo.split(';;')) {
        const parts = item.split(',,');
        if (parts.length !== 2 || !parts[1]) {
            continue;
        }
        const licenseData = {
            '许可证号': parts[1].trim(),
            '楼幢': [],
        };
        // Buildings are the <span> siblings that follow the `.three` element containing
        // the licence number, up to the next `.three` element.
        const licenseDiv = $(`.three:contains('${parts[1]}')`);
        licenseDiv.nextUntil('.three', 'span').each((i, span) => {
            const buildingText = $(span).text().trim();
            const building = {};
            const match = buildingText.match(/(.*)\(成交均价:(.*)\)/);
            if (match) {
                building['楼幢名称'] = match[1].trim();
                building['成交均价'] = match[2].trim().replace(')', '');
            } else {
                building['楼幢名称'] = buildingText;
            }
            // The `bid` attribute lives on the radio input immediately before the span, if any.
            const bid = $(span).prev('input[name="radiobuild"]').attr('bid');
            if (bid) {
                building['bid'] = bid.trim();
            }
            licenseData['楼幢'].push(building);
        });
        licenses.push(licenseData);
    }
    return licenses;
}

/**
 * Fetch a project's detail page and merge what it contains into the record.
 *
 * The record is modified in place: every label/value pair of the
 * `.housedetail` table becomes a property (a label equal to an existing key
 * overwrites it), and `预售许可证` receives the list of pre-sale licences with
 * their buildings. Failures are logged and swallowed, in which case the
 * record is returned with whatever had been merged before the error.
 *
 * @param {Object} project - List record; must carry `项目链接` to be fetched.
 * @returns {Promise<Object>} The same `project` object (returned untouched
 *   when it is falsy or has no link).
 */
async function scrapeProjectDetails(project) {
    if (!project || !project['项目链接']) {
        return project;
    }
    try {
        console.log(`正在抓取详情: ${project['项目名称']}`);
        const response = await axios.get(project['项目链接']);
        const $ = cheerio.load(response.data);
        readDetailFields($, project);
        project['预售许可证'] = readPresellLicenses($);
    } catch (error) {
        console.error(`抓取详情失败: ${project['项目名称']}`, error.message);
    }
    return project;
}
/**
 * Scrape the whole site: walk every page of the project list, then fetch the
 * detail page of each collected project.
 *
 * Page 1 is fetched with GET. Each following page is requested by posting the
 * previous page's hidden form state (`__VIEWSTATE` and related fields) with the
 * "next page" control as the event target. Paging stops early when a page has
 * no `__VIEWSTATE` or yields no rows. Detail pages are then requested for all
 * records at once via `Promise.all`.
 *
 * @returns {Promise<Array<Object>>} Every scraped record, enriched with details.
 */
async function scrapeWebsite() {
    const allData = [];
    console.log('开始抓取第一页数据...');
    const origin = new URL(START_URL).origin;
    let response = await axios.get(START_URL);
    let $ = cheerio.load(response.data);
    allData.push(...await extractDataFromHtml($, origin));
    console.log(`第一页抓取完成,获得 ${allData.length} 条数据。`);

    // Total page count; a missing or non-numeric label is treated as a single page.
    const pageCountSpan = $('#PageNavigator1_LblPageCount');
    const parsedPageCount = pageCountSpan.length ? Number.parseInt(pageCountSpan.text(), 10) : 1;
    const totalPages = Number.isNaN(parsedPageCount) ? 1 : parsedPageCount;
    console.log(`共发现 ${totalPages} 页。`);

    // Search-form fields of the first page, replayed unchanged with every page request.
    const formValues = {};
    $('input[name^="txt"], select').each((idx, el) => {
        const name = $(el).attr('name');
        if (name) {
            formValues[name] = $(el).val() || '';
        }
    });

    // Remaining pages: `$` always holds the most recently loaded page, whose
    // hidden state fields are needed to request the one after it.
    for (let i = 2; i <= totalPages; i++) {
        console.log(`正在抓取第 ${i} 页...`);
        const viewState = $('#__VIEWSTATE').val();
        const viewStateGenerator = $('#__VIEWSTATEGENERATOR').val();
        const eventValidation = $('#__EVENTVALIDATION').val();
        if (!viewState) {
            console.log('无法找到 __VIEWSTATE终止抓取。');
            break;
        }
        const postData = new URLSearchParams();
        postData.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext');
        postData.append('__EVENTARGUMENT', '');
        postData.append('__VIEWSTATE', viewState);
        // Optional hidden fields are sent only when the page has them:
        // URLSearchParams would otherwise submit the literal string "undefined".
        if (viewStateGenerator) {
            postData.append('__VIEWSTATEGENERATOR', viewStateGenerator);
        }
        if (eventValidation) {
            postData.append('__EVENTVALIDATION', eventValidation);
        }
        for (const [name, value] of Object.entries(formValues)) {
            postData.append(name, value);
        }
        // Index of the page currently shown, i.e. the one before the page being requested.
        postData.append('PageNavigator1$txtNewPageIndex', (i - 1).toString());
        response = await axios.post(START_URL, postData, {
            headers: {
                'Content-Type': 'application/x-www-form-urlencoded',
                'Referer': START_URL,
            },
        });
        $ = cheerio.load(response.data);
        const nextPageData = await extractDataFromHtml($, origin);
        console.log(`${i} 页抓取完成,获得 ${nextPageData.length} 条数据。`);
        if (nextPageData.length === 0) {
            console.log(`${i} 页没有数据,抓取结束。`);
            break;
        }
        allData.push(...nextPageData);
    }
    console.log(`\n抓取全站数据完毕!共 ${allData.length} 条原始记录。`);

    console.log('\n开始抓取项目详情...');
    // NOTE(review): every detail request starts at once; any throttling would have
    // to come from the imported axios wrapper, which is not visible here — confirm.
    const detailedData = await Promise.all(allData.map((project) => scrapeProjectDetails(project)));
    console.log('所有项目详情抓取完毕。');
    return detailedData;
}
/**
 * Post-process the scraped records.
 * Currently a single step: rewrite each record's `序号` field as its 1-based
 * position in the array, stored as a string.
 *
 * The records are modified in place and the very same array is returned.
 *
 * @param {Array<Object>} allData - Raw scraped records.
 * @returns {Promise<Array<Object>>} `allData`, renumbered.
 */
async function processScrapedData(allData) {
    let sequenceNumber = 1;
    for (const record of allData) {
        record['序号'] = String(sequenceNumber);
        sequenceNumber += 1;
    }
    console.log('序号字段已重新编号。');
    return allData;
}
/**
 * Entry point, exported so it can be invoked from the repository root.
 *
 * Runs the full scrape, post-processes the records and writes them as
 * pretty-printed JSON to `../data/project.json` relative to this script.
 * Errors are logged (with the HTTP status when the error carries a response)
 * and not rethrown.
 *
 * @returns {Promise<void>}
 */
export async function main() {
    try {
        const allData = await scrapeWebsite();
        const processedData = await processScrapedData(allData);

        const dataPath = path.join(__dirname, '..', 'data', 'project.json');
        // Create the output directory first: without it writeFile fails with
        // ENOENT and the result of the whole scrape would be lost.
        await fs.mkdir(path.dirname(dataPath), { recursive: true });
        await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8');
        console.log('项目数据已保存至 data/project.json 文件。');
        console.log('\n所有数据抓取和处理任务已完成。');
    } catch (error) {
        console.error('抓取或处理过程中发生错误:', error.message);
        if (error.response) {
            console.error('Status:', error.response.status);
        }
    }
}