抓取项目详情

This commit is contained in:
秦秋旭 2026-01-21 20:54:35 +08:00
parent 3b0476f6eb
commit 200928e384
2 changed files with 8467 additions and 117 deletions

File diff suppressed because it is too large Load Diff

View File

@ -91,6 +91,83 @@ async function extractDataFromHtml($, origin) {
return data.filter(item => item !== null);
}
/**
 * Normalizes the text of a label cell taken from the detail table.
 *
 * @param {string} rawLabel - Raw text content of the label cell.
 * @returns {string} The trimmed label with its first ASCII colon removed.
 */
function cleanDetailLabel(rawLabel) {
  return rawLabel.trim().replace(':', '');
}

/**
 * Builds one building record from a building `<span>` element.
 *
 * @param {Function} $ - Cheerio instance bound to the detail page.
 * @param {Object} span - The span element describing a single building.
 * @returns {Object} Record with `楼幢名称`, plus `成交均价` when the span text
 *   carries a "(成交均价:...)" suffix and `bid` when the directly preceding
 *   sibling is an `input[name="radiobuild"]` with a `bid` attribute.
 */
function parseBuildingSpan($, span) {
  const $span = $(span);
  const text = $span.text().trim();
  const building = {};

  const match = text.match(/(.*)\(成交均价:(.*)\)/);
  if (match) {
    building['楼幢名称'] = match[1].trim();
    building['成交均价'] = match[2].trim().replace(')', '');
  } else {
    building['楼幢名称'] = text;
  }

  // The building id lives on the radio input immediately before the span.
  const bid = $span.prev('input[name="radiobuild"]').attr('bid');
  if (bid) {
    building['bid'] = bid.trim();
  }
  return building;
}

/**
 * Collects the pre-sale licenses listed in the `#presellInfo` field together
 * with the buildings rendered after each license's `.three` element.
 *
 * The field value is read as `;;`-separated items, each of the form
 * `<something>,,<license number>`.
 *
 * @param {Function} $ - Cheerio instance bound to the detail page.
 * @returns {Array<Object>} One `{ 许可证号, 楼幢 }` record per usable item.
 */
function collectPresellLicenses($) {
  const presellInfo = $('#presellInfo').val();
  if (typeof presellInfo !== 'string' || presellInfo === '') {
    return [];
  }

  const licenses = [];
  for (const item of presellInfo.split(';;')) {
    const parts = item.split(',,');
    if (parts.length !== 2) continue;

    const licenseNo = parts[1].trim();
    if (!licenseNo) continue;

    // Match by plain substring instead of interpolating the number into a
    // `:contains('...')` selector: quotes or other selector metacharacters in
    // the number would otherwise make the selector throw and lose every
    // license of the project.
    const licenseDivs = $('.three').filter((_, el) => $(el).text().includes(licenseNo));

    const buildings = [];
    licenseDivs.nextUntil('.three', 'span').each((_, span) => {
      buildings.push(parseBuildingSpan($, span));
    });

    licenses.push({
      '许可证号': licenseNo,
      '楼幢': buildings
    });
  }
  return licenses;
}

/**
 * Fetches a project's detail page and merges the scraped fields into the
 * given project object.
 *
 * Every row of the `.housedetail` table contributes up to two label/value
 * pairs (cells 0-1 and cells 2-3); the parsed licenses are stored under
 * `预售许可证`. Failures are logged and never rethrown, so the returned promise
 * always resolves; fields written before a failure are kept.
 *
 * @param {Object} project - Project record; `项目链接` is the detail page URL.
 * @returns {Promise<Object>} The same `project` object (mutated in place), or
 *   the input unchanged when it is falsy or has no `项目链接`.
 */
async function scrapeProjectDetails(project) {
  if (!project || !project['项目链接']) {
    return project;
  }

  try {
    console.log(`正在抓取详情: ${project['项目名称']}`);
    const response = await axios.get(project['项目链接'], {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
      }
    });
    const $ = cheerio.load(response.data);

    // Each table row holds up to two label/value pairs side by side.
    $('.housedetail tr').each((_, row) => {
      const cells = $(row).find('td');
      for (const labelIndex of [0, 2]) {
        if (cells.length < labelIndex + 2) break;
        const key = cleanDetailLabel($(cells[labelIndex]).text());
        if (key) {
          project[key] = $(cells[labelIndex + 1]).text().trim();
        }
      }
    });

    project['预售许可证'] = collectPresellLicenses($);
  } catch (error) {
    console.error(`抓取详情失败: ${project['项目名称']}`, error.message);
  }

  return project;
}
/**
* 抓取网站所有数据处理分页
* @returns {Promise<Array<Object>>} - 所有抓取的数据记录数组
@ -173,7 +250,13 @@ async function scrapeWebsite() {
}
console.log(`\n抓取全站数据完毕!共 ${allData.length} 条原始记录。`);
return allData;
console.log('\n开始抓取项目详情...');
const detailedDataPromises = allData.map(project => scrapeProjectDetails(project));
const detailedData = await Promise.all(detailedDataPromises);
console.log('所有项目详情抓取完毕。');
return detailedData;
}
/**