抓取项目详情
This commit is contained in:
parent
3b0476f6eb
commit
200928e384
8499
Project/data.json
8499
Project/data.json
File diff suppressed because it is too large
Load Diff
@ -91,6 +91,83 @@ async function extractDataFromHtml($, origin) {
|
||||
return data.filter(item => item !== null);
|
||||
}
|
||||
|
||||
/** Milliseconds to wait for a detail page before giving up on that project. */
const PROJECT_DETAIL_TIMEOUT_MS = 30000;

/**
 * Normalizes a label cell of the detail table into a property name.
 * Removes surrounding whitespace and label colons, both ASCII ":" and
 * full-width ":", so "项目名称:" and "项目名称:" map to the same key.
 *
 * @param {string} rawLabel - Text content of the label cell.
 * @returns {string} The cleaned key; empty when the cell held no label.
 */
function normalizeDetailKey(rawLabel) {
  return rawLabel.trim().replace(/[::]/g, '').trim();
}

/**
 * Splits a building caption such as "1幢(成交均价:12000)" into its name and
 * average transaction price. Captions without a price part are returned as the
 * name unchanged.
 *
 * @param {string} buildingText - Trimmed text of a building <span>.
 * @returns {Object} Record with '楼幢名称' and, when present, '成交均价'.
 */
function parseBuildingCaption(buildingText) {
  // Accept both colon variants after "成交均价".
  const match = buildingText.match(/(.*)\(成交均价[::](.*)\)/);
  if (!match) {
    return { '楼幢名称': buildingText };
  }
  return {
    '楼幢名称': match[1].trim(),
    '成交均价': match[2].trim().replace(')', ''),
  };
}

/**
 * Fetches the detail page of one project and merges what it finds into the
 * given project record.
 *
 * Every label/value pair of the `.housedetail` table becomes a property on
 * `project`, and the pre-sale licenses (with their buildings) are stored under
 * '预售许可证'. The record is mutated in place and also returned. Failures are
 * logged and swallowed on purpose so that one broken page does not abort a
 * batch run; in that case the record is returned with whatever was collected
 * before the failure.
 *
 * @param {Object} project - Project record; must carry '项目链接' to be fetched.
 * @returns {Promise<Object>} The same `project` object (or the falsy input).
 */
async function scrapeProjectDetails(project) {
  if (!project || !project['项目链接']) {
    return project;
  }

  try {
    console.log(`正在抓取详情: ${project['项目名称']}`);
    const response = await axios.get(project['项目链接'], {
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
      },
      // Without a timeout a single stalled connection would keep the caller's
      // Promise.all pending forever.
      timeout: PROJECT_DETAIL_TIMEOUT_MS,
    });
    const $ = cheerio.load(response.data);

    // Each row of the housedetail table holds one or two label/value pairs:
    // cells (0, 1) and, when present, cells (2, 3).
    $('.housedetail tr').each((rowIndex, row) => {
      const cells = $(row).find('td');
      for (let labelIdx = 0; labelIdx < 4 && labelIdx + 1 < cells.length; labelIdx += 2) {
        const key = normalizeDetailKey($(cells[labelIdx]).text());
        if (key) {
          project[key] = $(cells[labelIdx + 1]).text().trim();
        }
      }
    });

    // Pre-sale licenses: the hidden #presellInfo field lists them as
    // "<x>,,<license number>" entries separated by ";;". The buildings of a
    // license are the <span> siblings that follow its `.three` element.
    const licenses = [];
    const presellInfo = $('#presellInfo').val();
    if (presellInfo) {
      for (const item of presellInfo.split(';;')) {
        const parts = item.split(',,');
        if (parts.length !== 2 || !parts[1]) {
          continue;
        }
        const licenseNumber = parts[1].trim();
        if (!licenseNumber) {
          // A blank number would match every `.three` element below.
          continue;
        }

        const licenseData = {
          '许可证号': licenseNumber,
          '楼幢': [],
        };

        // Match by text in JS instead of interpolating the scraped value into
        // a `:contains('…')` selector, which breaks on quotes or parentheses.
        // NOTE(review): this is still a substring match, as before, so a
        // license number that is a prefix of another one matches both.
        const licenseDiv = $('.three').filter((idx, el) => $(el).text().includes(licenseNumber));
        const buildingSpans = licenseDiv.nextUntil('.three', 'span');

        buildingSpans.each((spanIndex, span) => {
          const building = parseBuildingCaption($(span).text().trim());

          // The radio input directly before the span carries the building id.
          const bid = $(span).prev('input[name="radiobuild"]').attr('bid');
          if (bid) {
            building['bid'] = bid.trim();
          }

          licenseData['楼幢'].push(building);
        });
        licenses.push(licenseData);
      }
    }
    project['预售许可证'] = licenses;
  } catch (error) {
    // Best effort: report and fall through so the caller still gets the record.
    console.error(`抓取详情失败: ${project['项目名称']}`, error.message);
  }
  return project;
}
|
||||
|
||||
|
||||
/**
|
||||
* 抓取网站所有数据,处理分页
|
||||
* @returns {Promise<Array<Object>>} - 所有抓取的数据记录数组
|
||||
@ -173,7 +250,13 @@ async function scrapeWebsite() {
|
||||
}
|
||||
|
||||
console.log(`\n抓取全站数据完毕!共 ${allData.length} 条原始记录。`);
|
||||
return allData;
|
||||
|
||||
console.log('\n开始抓取项目详情...');
|
||||
const detailedDataPromises = allData.map(project => scrapeProjectDetails(project));
|
||||
const detailedData = await Promise.all(detailedDataPromises);
|
||||
console.log('所有项目详情抓取完毕。');
|
||||
|
||||
return detailedData;
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
Loading…
Reference in New Issue
Block a user