puning-real-estate/scripts/getProject.js
2026-01-22 16:04:40 +08:00

157 lines
5.9 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import axios from './axios.js';
import * as cheerio from 'cheerio';
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';
import { getEncryptedUrl } from './getEncryptedUrl.js';
import { scrapePaginatedData, reIndex } from './scraperUtils.js'; // Import new utils
// ES modules have no built-in __filename/__dirname, so derive them from this module's URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Root URL of the HPMS site being scraped.
const BASE_URL = 'http://120.236.48.169:89/HPMS/';
// Project list page; main() hands this to scrapePaginatedData as the starting URL.
const START_URL = BASE_URL + 'ProjectInfoList.aspx';
/**
 * Extracts the project rows from one page of the project list table.
 *
 * Rows with fewer than four cells are dropped. For rows whose name cell holds
 * a link with an `open_click('...')` handler, the captured argument is passed
 * to `getEncryptedUrl` to obtain the detail link; all rows are resolved
 * concurrently.
 *
 * @param {Function} $ - Cheerio-style query function for the list page.
 * @param {*} origin - Forwarded unchanged to `getEncryptedUrl`
 *   (presumably the URL of the page being parsed; verify against caller).
 * @returns {Promise<Object[]>} One record per data row, in table order.
 */
async function extractDataFromHtml($, origin) {
  const openClickPattern = /open_click\(['"]([^'"]+)['"]\)/;
  const textOf = (node) => $(node).text().trim();

  // Resolves the detail link hidden in the anchor's onclick handler,
  // or yields '' when the row has no usable handler.
  const resolveLink = async (anchor) => {
    if (!anchor.length) return '';
    const handler = anchor.attr('onclick');
    if (!handler) return '';
    const found = handler.match(openClickPattern);
    if (!found || !found[1]) return '';
    return getEncryptedUrl(found[1], origin);
  };

  // Converts one <tr> into a record, or null when it is not a full data row.
  const toRecord = async (row) => {
    const cells = $(row).find('td');
    if (cells.length < 4) return null;

    const nameCell = $(cells[1]);
    const anchor = nameCell.find('a');
    // Prefer the link text; fall back to the whole cell when there is no link.
    const nameSource = anchor.length ? anchor : nameCell;

    return {
      '序号': textOf(cells[0]),
      '项目名称': nameSource.text().trim(),
      '开发企业': textOf(cells[2]),
      '项目地址': textOf(cells[3]),
      '项目链接': await resolveLink(anchor),
    };
  };

  const tableRows = $('.resultlist table tr:has(td)').get();
  const records = await Promise.all(tableRows.map((row) => toRecord(row)));
  return records.filter((record) => record !== null);
}
// Splits a building label such as "1栋(成交均价:8000元)" into name and price.
// Both ASCII and full-width brackets/colons are accepted, because the pages
// may use either form of punctuation.
const BUILDING_PRICE_PATTERN = /(.*)[(\uFF08]成交均价[:\uFF1A](.*)[)\uFF09]/;

// Characters removed from detail labels: ASCII and full-width colons plus
// no-break, ideographic and zero-width spaces.
const DETAIL_KEY_NOISE = /[:\uFF1A\u00A0\u3000\u200B\uFEFF]/g;

/**
 * Turns the text of a label cell into a clean object key.
 *
 * @param {string} label - Raw text of the label cell.
 * @returns {string} The label without colons and invisible padding; may be ''.
 */
function normalizeDetailKey(label) {
  return label.replace(DETAIL_KEY_NOISE, '').trim();
}

/**
 * Copies the label/value pairs of the `.housedetail` table onto `project`.
 *
 * @param {Function} $ - Cheerio-style query function for the detail page.
 * @param {Object} project - Record that receives the extracted fields.
 */
function collectDetailFields($, project) {
  $('.housedetail tr').each((_, row) => {
    const cells = $(row).find('td');
    // A row carries up to two label/value pairs: cells 0-1 and cells 2-3.
    for (let offset = 0; offset < 4 && offset + 1 < cells.length; offset += 2) {
      const key = normalizeDetailKey($(cells[offset]).text());
      if (key) {
        project[key] = $(cells[offset + 1]).text().trim();
      }
    }
  });
}

/**
 * Parses one building `<span>` that follows a licence heading.
 *
 * @param {Function} $ - Cheerio-style query function for the detail page.
 * @param {*} span - The span element describing the building.
 * @returns {Object} Building name, plus average price and `bid` when present.
 */
function parseBuilding($, span) {
  const label = $(span).text().trim();
  const building = {};
  const match = label.match(BUILDING_PRICE_PATTERN);
  if (match) {
    building['楼幢名称'] = match[1].trim();
    // The greedy capture can keep one stray closing bracket; drop the first one.
    building['成交均价'] = match[2].trim().replace(/[)\uFF09]/, '');
  } else {
    building['楼幢名称'] = label;
  }
  // The building id lives on the radio input placed directly before the span.
  const bid = $(span).prev('input[name="radiobuild"]').attr('bid');
  if (bid) {
    building['bid'] = bid.trim();
  }
  return building;
}

/**
 * Reads the pre-sale licences listed in `#presellInfo` together with the
 * buildings shown under each licence heading (`.three`).
 *
 * `#presellInfo` holds `;;`-separated items, each of the form `x,,licenceNo`.
 *
 * @param {Function} $ - Cheerio-style query function for the detail page.
 * @returns {Object[]} One entry per licence with its list of buildings.
 */
function collectLicenses($) {
  const presellInfo = $('#presellInfo').val();
  if (!presellInfo) {
    return [];
  }

  const licenses = [];
  for (const item of presellInfo.split(';;')) {
    const parts = item.split(',,');
    if (parts.length !== 2) continue;

    const licenseNo = parts[1].trim();
    // An empty number would match every heading, so such items are skipped.
    if (!licenseNo) continue;

    // Match on the element text instead of building a `:contains('...')`
    // selector, so quotes or brackets in the number cannot break the query.
    const heading = $('.three').filter((_, el) => $(el).text().includes(licenseNo));

    const buildings = [];
    heading.nextUntil('.three', 'span').each((_, span) => {
      buildings.push(parseBuilding($, span));
    });

    licenses.push({ '许可证号': licenseNo, '楼幢': buildings });
  }
  return licenses;
}

/**
 * Fetches the detail page of a project and enriches the record with it.
 *
 * The record is modified in place: every label/value pair of the
 * `.housedetail` table becomes a property, and `预售许可证` receives the list
 * of pre-sale licences. Failures are logged and swallowed (best effort), so
 * the record is always returned, possibly only partially enriched.
 *
 * @param {Object} project - List record; needs `项目链接` to be scraped.
 * @returns {Promise<Object>} The same `project` object that was passed in.
 */
async function scrapeProjectDetails(project) {
  if (!project || !project['项目链接']) {
    return project;
  }
  try {
    console.log(`正在抓取详情: ${project['项目名称']}`);
    const response = await axios.get(project['项目链接']);
    const $ = cheerio.load(response.data);
    collectDetailFields($, project);
    project['预售许可证'] = collectLicenses($);
  } catch (error) {
    console.error(`抓取详情失败: ${project['项目名称']}`, error.message);
  }
  return project;
}
// The old scrapeWebsite function is removed.
/**
 * Entry point, exported so it can be invoked from the project root.
 *
 * Scrapes the paginated project list, fetches the details of every project
 * concurrently, re-indexes the result and writes it to `data/project.json`
 * next to the scripts directory.
 *
 * @returns {Promise<void>}
 * @throws Rethrows any error after logging it, so the caller can handle it.
 */
export async function main() {
  try {
    console.log('🚀 开始抓取项目数据...');
    const projects = await scrapePaginatedData(START_URL, extractDataFromHtml);

    console.log('\n开始抓取项目详情...');
    const detailed = await Promise.all(
      projects.map((entry) => scrapeProjectDetails(entry)),
    );
    console.log('所有项目详情抓取完毕。');

    // Generic re-indexing shared with the other scrapers.
    const output = JSON.stringify(reIndex(detailed), null, 4);

    // Persist the result as a JSON file.
    const outputDir = path.join(__dirname, '..', 'data');
    await fs.mkdir(outputDir, { recursive: true });
    await fs.writeFile(path.join(outputDir, 'project.json'), output, 'utf-8');

    console.log('项目数据已保存至 data/project.json 文件。');
    console.log('\n所有数据抓取和处理任务已完成。');
  } catch (error) {
    console.error('在 getProject.js 抓取或处理过程中发生错误:', error.message);
    const { response } = error;
    if (response) {
      console.error('Status:', response.status);
    }
    // Propagate the original error to the caller.
    throw error;
  }
}