import axios from './axios.js';
import * as cheerio from 'cheerio';
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';
import { getEncryptedUrl } from './getEncryptedUrl.js';
import { scrapePaginatedData, reIndex } from './scraperUtils.js'; // Import new utils

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

const BASE_URL = 'http://120.236.48.169:89/HPMS/';
const START_URL = `${BASE_URL}ProjectInfoList.aspx`;

// Captures the single quoted argument of an `open_click('...')` onclick handler.
const OPEN_CLICK_PATTERN = /open_click\(['"]([^'"]+)['"]\)/;
// Label separators to strip from detail-table keys: ASCII colon and full-width colon (U+FF1A).
const LABEL_COLON_PATTERN = /[:\uFF1A]/g;
// Splits "<building name>(成交均价:<price>)" into name and price.
const BUILDING_PRICE_PATTERN = /(.*)\(成交均价:(.*)\)/;

/**
 * Extracts the project rows from a loaded list page.
 *
 * Every `<tr>` that contains `<td>` cells inside `.resultlist table` is read;
 * rows with fewer than four cells are dropped. When the project-name cell holds
 * a link whose `onclick` is `open_click('<arg>')`, the argument is passed to
 * `getEncryptedUrl` together with `origin` and the awaited result is stored as
 * the project link (the helper is imported; its exact semantics are not visible here).
 *
 * @param {Function} $ - cheerio instance loaded with the list page HTML.
 * @param {*} origin - forwarded unchanged to `getEncryptedUrl`.
 * @returns {Promise<object[]>} one record per usable row, in document order.
 */
async function extractDataFromHtml($, origin) {
    const rows = $('.resultlist table tr:has(td)').get();

    const parsedRows = await Promise.all(rows.map(async (row) => {
        const columns = $(row).find('td');
        if (columns.length < 4) {
            return null;
        }

        const projectNameCell = $(columns[1]);
        const projectNameLink = projectNameCell.find('a');
        const hasLink = projectNameLink.length > 0;

        const rowData = {
            '序号': $(columns[0]).text().trim(),
            '项目名称': (hasLink ? projectNameLink : projectNameCell).text().trim(),
            '开发企业': $(columns[2]).text().trim(),
            '项目地址': $(columns[3]).text().trim(),
            '项目链接': '',
        };

        // `attr` returns undefined when there is no link or no onclick attribute.
        const onclickAttr = hasLink ? projectNameLink.attr('onclick') : undefined;
        const match = onclickAttr ? onclickAttr.match(OPEN_CLICK_PATTERN) : null;
        if (match && match[1]) {
            rowData['项目链接'] = await getEncryptedUrl(match[1], origin);
        }

        return rowData;
    }));

    return parsedRows.filter((item) => item !== null);
}

/**
 * Normalizes a detail-table label into an object key.
 * Removes ASCII and full-width colons, then trims surrounding whitespace.
 *
 * @param {string} text - raw cell text.
 * @returns {string} cleaned label; may be empty.
 */
function cleanLabel(text) {
    return text.replace(LABEL_COLON_PATTERN, '').trim();
}

/**
 * Builds one building record from a `<span>` that follows a license header.
 *
 * @param {Function} $ - cheerio instance for the detail page.
 * @param {object} span - the span element to read.
 * @returns {object} record with '楼幢名称', and '成交均价' / 'bid' when present.
 */
function parseBuilding($, span) {
    const buildingText = $(span).text().trim();
    const building = {};

    const match = buildingText.match(BUILDING_PRICE_PATTERN);
    if (match) {
        building['楼幢名称'] = match[1].trim();
        building['成交均价'] = match[2].trim().replace(')', '');
    } else {
        building['楼幢名称'] = buildingText;
    }

    // The building id lives on the radio input immediately before the span.
    const bid = $(span).prev('input[name="radiobuild"]').attr('bid');
    if (bid) {
        building['bid'] = bid.trim();
    }

    return building;
}

/**
 * Parses the pre-sale licenses listed in the `#presellInfo` field.
 *
 * The field value is split on ';;' into items and each item on ',,'; only items
 * with exactly two parts and a non-blank second part (the license number) are used.
 * Buildings are the `<span>` siblings between the `.three` element whose text
 * contains the license number and the next `.three` element.
 *
 * @param {Function} $ - cheerio instance for the detail page.
 * @returns {object[]} list of `{ '许可证号', '楼幢' }` records; empty when the field is absent.
 */
function parsePresellLicenses($) {
    const licenses = [];
    const presellInfo = $('#presellInfo').val();
    if (!presellInfo) {
        return licenses;
    }

    for (const item of presellInfo.split(';;')) {
        const parts = item.split(',,');
        if (parts.length !== 2) {
            continue;
        }
        const licenseNo = parts[1].trim();
        if (!licenseNo) {
            continue;
        }

        // Match on text instead of interpolating into `:contains('...')`, so quotes or
        // parentheses inside the scraped license number cannot break the selector.
        const licenseDiv = $('.three').filter((i, el) => $(el).text().includes(licenseNo));
        const buildings = licenseDiv
            .nextUntil('.three', 'span')
            .toArray()
            .map((span) => parseBuilding($, span));

        licenses.push({ '许可证号': licenseNo, '楼幢': buildings });
    }

    return licenses;
}

/**
 * Fetches a project's detail page and merges its fields into `project`.
 *
 * The passed object is mutated and returned. Each `.housedetail` table row
 * contributes up to two label/value pairs (cells 0-1 and cells 2-3), and the
 * parsed license list is stored under '预售许可证'. Any failure is logged and
 * swallowed on purpose, so one bad page does not abort the whole run; the
 * project is then returned with whatever was collected before the error.
 *
 * @param {object} project - list record; needs a truthy '项目链接' to be fetched.
 * @returns {Promise<object>} the same `project` object (or the falsy input unchanged).
 */
async function scrapeProjectDetails(project) {
    if (!project || !project['项目链接']) {
        return project;
    }

    try {
        console.log(`正在抓取详情: ${project['项目名称']}`);
        const response = await axios.get(project['项目链接']);
        const $ = cheerio.load(response.data);

        // Copy every label/value pair of the housedetail table onto the project.
        $('.housedetail tr').each((i, row) => {
            const cells = $(row).find('td');
            for (const labelIndex of [0, 2]) {
                if (cells.length < labelIndex + 2) {
                    break;
                }
                const key = cleanLabel($(cells[labelIndex]).text());
                const value = $(cells[labelIndex + 1]).text().trim();
                if (key) {
                    project[key] = value;
                }
            }
        });

        // Pre-sale licenses need dedicated parsing because they nest buildings.
        project['预售许可证'] = parsePresellLicenses($);
    } catch (error) {
        console.error(`抓取详情失败: ${project['项目名称']}`, error.message);
    }

    return project;
}

// The old scrapeWebsite function is removed.
/**
 * Main entry point, exported so it can be invoked from the repository root.
 *
 * Collects the project list via `scrapePaginatedData`, enriches every entry with
 * `scrapeProjectDetails` (all entries are started together and awaited with
 * `Promise.all`), passes the result through `reIndex`, and writes it as
 * 4-space-indented JSON to `../data/project.json` relative to this module,
 * creating the directory if needed.
 *
 * @returns {Promise<void>}
 * @throws rethrows any error raised while scraping or writing, after logging it.
 */
export async function main() {
    try {
        console.log('🚀 开始抓取项目数据...');
        const projectList = await scrapePaginatedData(START_URL, extractDataFromHtml);

        console.log('\n开始抓取项目详情...');
        const pendingDetails = projectList.map((entry) => scrapeProjectDetails(entry));
        const allData = await Promise.all(pendingDetails);
        console.log('所有项目详情抓取完毕。');

        // Generic re-indexing provided by scraperUtils.
        const processedData = reIndex(allData);

        // Persist the result as a JSON file.
        const outputFile = path.join(__dirname, '..', 'data', 'project.json');
        const outputDir = path.dirname(outputFile);
        await fs.mkdir(outputDir, { recursive: true });
        const serialized = JSON.stringify(processedData, null, 4);
        await fs.writeFile(outputFile, serialized, 'utf-8');
        console.log('项目数据已保存至 data/project.json 文件。');

        console.log('\n所有数据抓取和处理任务已完成。');
    } catch (error) {
        console.error('在 getProject.js 抓取或处理过程中发生错误:', error.message);
        const { response } = error;
        if (response) {
            console.error('Status:', response.status);
        }
        // Propagate the original error so the caller can handle it.
        throw error;
    }
}