import axios from './axios.js';
import * as cheerio from 'cheerio';
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';

import { getEncryptedUrl } from './getEncryptedUrl.js';
import { scrapePaginatedData, reIndex } from './scraperUtils.js'; // Import new utils
// Recreate CommonJS-style __filename/__dirname in an ES module context.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Root of the HPMS site (housing project management system, reached by raw IP
// over plain HTTP — presumably an intranet/government endpoint; TODO confirm).
const BASE_URL = 'http://120.236.48.169:89/HPMS/';
// Paginated project-listing page that scraping starts from.
const START_URL = BASE_URL + 'ProjectInfoList.aspx';
// Extract project rows from one listing page's results table, resolving each
// row's encrypted detail-page URL via getEncryptedUrl.
//
// @param {object} $ - cheerio handle loaded with the listing page HTML
// @param {string} origin - page origin, forwarded to getEncryptedUrl
// @returns {Promise<object[]>} one record per valid data row
async function extractDataFromHtml($, origin) {
  const tableRows = $('.resultlist table tr:has(td)').get();

  const parsedRows = await Promise.all(
    tableRows.map(async (tr) => {
      const cells = $(tr).find('td');
      // Skip rows that don't carry the full 4-column layout.
      if (cells.length < 4) {
        return null;
      }

      const nameCell = $(cells[1]);
      const nameLink = nameCell.find('a');

      // The detail link is hidden behind an onclick="open_click('...')" handler.
      let detailUrl = '';
      if (nameLink.length) {
        const onclick = nameLink.attr('onclick');
        if (onclick) {
          const m = onclick.match(/open_click\(['"]([^'"]+)['"]\)/);
          if (m && m[1]) {
            detailUrl = await getEncryptedUrl(m[1], origin);
          }
        }
      }

      return {
        '序号': $(cells[0]).text().trim(),
        '项目名称': nameLink.length ? nameLink.text().trim() : nameCell.text().trim(),
        '开发企业': $(cells[2]).text().trim(),
        '项目地址': $(cells[3]).text().trim(),
        '项目链接': detailUrl,
      };
    })
  );

  return parsedRows.filter((row) => row !== null);
}
|
||
|
||
/**
 * Fetch a project's detail page and merge its fields into the record in place.
 *
 * Fix: the original key cleanup chained two identical `.replace(':','')`
 * calls, which strips at most two ASCII colons and was almost certainly a
 * mangled attempt to also strip the fullwidth colon `：` used on this Chinese
 * site. Labels and the 成交均价 pattern now accept both ASCII and fullwidth
 * punctuation.
 *
 * @param {object|null} project - record from the listing page; mutated in place
 * @returns {Promise<object|null>} the same record (best-effort: fetch/parse
 *   failures are logged and the record is returned unchanged)
 */
async function scrapeProjectDetails(project) {
  if (!project || !project['项目链接']) {
    return project;
  }

  // Strip every colon, ASCII or fullwidth, from a field label.
  const cleanKey = (text) => text.trim().replace(/[:：]/g, '');

  try {
    console.log(`正在抓取详情: ${project['项目名称']}`);
    const response = await axios.get(project['项目链接']);
    const $ = cheerio.load(response.data);

    // The detail table lays out key/value pairs two per row:
    // cells [0]/[1] and, when present, [2]/[3].
    $('.housedetail tr').each((i, row) => {
      const cells = $(row).find('td');
      if (cells.length >= 2) {
        const key = cleanKey($(cells[0]).text());
        const value = $(cells[1]).text().trim();
        if (key) project[key] = value;
      }
      if (cells.length >= 4) {
        const key = cleanKey($(cells[2]).text());
        const value = $(cells[3]).text().trim();
        if (key) project[key] = value;
      }
    });

    // 预售许可证 (pre-sale licenses): a hidden input serializes items as
    // "x,,licenseNo;;x,,licenseNo" (parts[0]'s meaning is unused here —
    // presumably an id; TODO confirm). Each license number is then located
    // back in the page to collect its building spans.
    const licenses = [];
    const presellInfo = $('#presellInfo').val();
    if (presellInfo) {
      const licenseItems = presellInfo.split(';;');
      for (const item of licenseItems) {
        const parts = item.split(',,');
        if (parts.length === 2 && parts[1]) {
          const licenseData = {
            '许可证号': parts[1].trim(),
            '楼幢': []
          };

          // Building spans sit between this license's '.three' element and
          // the next '.three'.
          const licenseDiv = $(`.three:contains('${parts[1]}')`);
          const buildingSpans = licenseDiv.nextUntil('.three', 'span');

          buildingSpans.each((i, span) => {
            const buildingText = $(span).text().trim();
            const building = {};
            // Accept ASCII and fullwidth parentheses/colon, e.g.
            // "1栋(成交均价:8500)" or "1栋（成交均价：8500）".
            const match = buildingText.match(/(.*)[(（]成交均价[:：](.*)[)）]/);
            if (match) {
              building['楼幢名称'] = match[1].trim();
              building['成交均价'] = match[2].trim().replace(/[)）]/g, '');
            } else {
              building['楼幢名称'] = buildingText;
            }

            // The preceding radio input carries the building id as a 'bid' attr.
            const bid = $(span).prev('input[name="radiobuild"]').attr('bid');
            if (bid) {
              building['bid'] = bid.trim();
            }

            licenseData['楼幢'].push(building);
          });
          licenses.push(licenseData);
        }
      }
    }
    project['预售许可证'] = licenses;

  } catch (error) {
    // Best-effort: one failed detail page must not abort the whole run.
    console.error(`抓取详情失败: ${project['项目名称']}`, error.message);
  }
  return project;
}
|
||
|
||
|
||
// The old scrapeWebsite function is removed.
|
||
|
||
// Entry point — exported so the root-level runner can invoke it.
// Pipeline: paginated listing scrape → per-project detail scrape (parallel)
// → re-index → persist to data/project.json.
export async function main() {
  try {
    console.log('🚀 开始抓取项目数据...');
    const projectList = await scrapePaginatedData(START_URL, extractDataFromHtml);

    console.log('\n开始抓取项目详情...');
    // Fetch every project's detail page concurrently.
    const allData = await Promise.all(
      projectList.map((project) => scrapeProjectDetails(project))
    );
    console.log('所有项目详情抓取完毕。');

    // Generic re-indexing from scraperUtils.
    const processedData = reIndex(allData);

    // Persist as pretty-printed JSON, creating the data/ directory if needed.
    const dataPath = path.join(__dirname, '..', 'data', 'project.json');
    await fs.mkdir(path.dirname(dataPath), { recursive: true });
    await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8');
    console.log('项目数据已保存至 data/project.json 文件。');

    console.log('\n所有数据抓取和处理任务已完成。');

  } catch (error) {
    console.error('在 getProject.js 抓取或处理过程中发生错误:', error.message);
    if (error.response) {
      console.error('Status:', error.response.status);
    }
    throw error; // Re-throw the error to be caught by the caller
  }
}
|