import axios from 'axios';
import * as cheerio from 'cheerio';
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

const BASE_URL = 'http://120.236.48.169:89/HPMS/';
const START_URL = BASE_URL + 'ProjectInfoList.aspx';

// Browser-like User-Agent sent with every request.
const USER_AGENT =
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';

// Abort any single HTTP request that takes longer than this, so one stalled
// connection cannot hang the whole scrape.
const REQUEST_TIMEOUT_MS = 30000;

// Upper bound on simultaneous requests issued against the target server.
const MAX_CONCURRENT_REQUESTS = 5;

/**
 * Escape the five characters that are special in XML text and attributes.
 *
 * NOTE(review): only the `&` replacement survived in the source this file was
 * recovered from; the remaining replacements are the standard XML entity set.
 * Confirm against the upstream original.
 *
 * @param {string} text - Raw text to embed in an XML document.
 * @returns {string} The escaped text.
 */
function xmlEncode(text) {
  return text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&apos;');
}

/**
 * Run `mapper` over `items` with at most `limit` calls in flight at once.
 * Results are returned in input order. Rejects as soon as any mapper rejects.
 *
 * @param {Array} items - Values to process.
 * @param {number} limit - Maximum number of concurrent mapper calls.
 * @param {Function} mapper - Async function `(item, index) => result`.
 * @returns {Promise<Array>} Mapper results, index-aligned with `items`.
 */
async function mapWithConcurrency(items, limit, mapper) {
  const results = new Array(items.length);
  let nextIndex = 0;

  // Each worker repeatedly claims the next unprocessed index. Claiming is
  // synchronous, so two workers can never take the same index.
  const worker = async () => {
    while (nextIndex < items.length) {
      const index = nextIndex;
      nextIndex += 1;
      results[index] = await mapper(items[index], index);
    }
  };

  const workerCount = Math.min(Math.max(limit, 1), items.length);
  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}

/**
 * Ask the site's encryption endpoint to encrypt the query string of a project
 * link and build the absolute detail-page URL from the result.
 *
 * Falls back to the plain absolute URL when the link has no query string, the
 * endpoint returns an empty body, or the request fails.
 *
 * NOTE(review): the opening statements of this function were lost in the
 * source; splitting `relativeUrl` at the first `?` is reconstructed from the
 * later uses of the path and query string. Confirm against the upstream
 * original.
 *
 * @param {string} relativeUrl - Link taken from the `open_click(...)` handler.
 * @param {string} origin - Origin the link is resolved against.
 * @returns {Promise<string>} Absolute URL of the project detail page.
 */
async function getEncryptedUrl(relativeUrl, origin) {
  const queryStart = relativeUrl.indexOf('?');
  const pathname = queryStart === -1 ? relativeUrl : relativeUrl.slice(0, queryStart);
  const queryString = queryStart === -1 ? '' : relativeUrl.slice(queryStart + 1);

  if (!queryString) {
    // Nothing to encrypt.
    return new URL(relativeUrl, origin).href;
  }

  // FIXME(review): the XML envelope that wrapped this value was stripped from
  // the source (everything between `<` and `>` was lost). Only the surviving
  // text is reproduced here; restore the real envelope before relying on the
  // encryption endpoint. Until then the request is expected to fail and the
  // plain URL below is used.
  const xmlPayload = ` ${xmlEncode(queryString)} `;
  const encryptionUrl = new URL('/Common/Agents/ExeFunCommon.aspx', origin).href;

  try {
    const response = await axios.post(encryptionUrl, xmlPayload, {
      headers: {
        'Content-Type': 'application/xml',
        'User-Agent': USER_AGENT,
      },
      timeout: REQUEST_TIMEOUT_MS,
    });
    const encryptedQuery = response.data;
    if (encryptedQuery) {
      return `${new URL(pathname, origin).href}?${encodeURIComponent(encryptedQuery)}`;
    }
  } catch (error) {
    console.error(`加密链接失败: ${relativeUrl}`, error.message);
  }

  // Fall back to the original URL on error or empty response.
  return new URL(relativeUrl, origin).href;
}

/**
 * Extract one record per data row from the project list table.
 *
 * Rows with fewer than four cells are skipped. When the project-name cell
 * holds a link whose `onclick` calls `open_click('<url>')`, that URL is
 * resolved through `getEncryptedUrl`.
 *
 * @param {Function} $ - Cheerio instance loaded with a list page.
 * @param {string} origin - Origin used to resolve project links.
 * @returns {Promise<Array<Object>>} Extracted rows, in table order.
 */
async function extractDataFromHtml($, origin) {
  const rows = $('.resultlist table tr:has(td)').get();

  const records = await mapWithConcurrency(rows, MAX_CONCURRENT_REQUESTS, async (row) => {
    const columns = $(row).find('td');
    if (columns.length < 4) {
      return null;
    }

    const projectNameCell = $(columns[1]);
    const projectNameLink = projectNameCell.find('a');
    const hasLink = projectNameLink.length > 0;

    const rowData = {
      '序号': $(columns[0]).text().trim(),
      '项目名称': (hasLink ? projectNameLink : projectNameCell).text().trim(),
      '开发企业': $(columns[2]).text().trim(),
      '项目地址': $(columns[3]).text().trim(),
      '项目链接': '',
    };

    const onclickAttr = hasLink ? projectNameLink.attr('onclick') : undefined;
    if (onclickAttr) {
      const match = onclickAttr.match(/open_click\(['"]([^'"]+)['"]\)/);
      if (match && match[1]) {
        rowData['项目链接'] = await getEncryptedUrl(match[1], origin);
      }
    }
    return rowData;
  });

  return records.filter((item) => item !== null);
}

/**
 * Normalize a detail-table label by removing colons and surrounding space.
 *
 * NOTE(review): the original chained two identical `.replace(':', '')` calls,
 * which looks like an ASCII/fullwidth pair whose fullwidth variant was lost.
 * Both forms are removed here — confirm against the upstream original.
 *
 * @param {string} text - Raw label cell text.
 * @returns {string} The cleaned label, possibly empty.
 */
function cleanLabel(text) {
  return text.trim().replace(/[:\uFF1A]/g, '').trim();
}

/**
 * Copy every label/value pair of the `.housedetail` table onto `project`.
 * Each row may hold one pair (cells 0-1) or two pairs (cells 0-1 and 2-3).
 *
 * @param {Function} $ - Cheerio instance loaded with a detail page.
 * @param {Object} project - Record that receives the fields (mutated).
 */
function collectDetailFields($, project) {
  $('.housedetail tr').each((rowIndex, row) => {
    const cells = $(row).find('td');
    for (const offset of [0, 2]) {
      if (cells.length >= offset + 2) {
        const key = cleanLabel($(cells[offset]).text());
        if (key) {
          project[key] = $(cells[offset + 1]).text().trim();
        }
      }
    }
  });
}

/**
 * Parse one building `<span>` that follows a licence heading.
 *
 * The text is expected to look like `name(成交均价:price)`; ASCII and
 * fullwidth brackets/colon are both accepted. Text that does not match is
 * stored whole as the building name.
 *
 * @param {Function} $ - Cheerio instance loaded with a detail page.
 * @param {Object} span - The building span element.
 * @returns {Object} Building record with `楼幢名称` and, when present,
 *   `成交均价` and `bid`.
 */
function parseBuilding($, span) {
  const buildingText = $(span).text().trim();
  const building = {};

  const match = buildingText.match(/(.*)[(\uFF08]成交均价[:\uFF1A](.*)[)\uFF09]/);
  if (match) {
    building['楼幢名称'] = match[1].trim();
    // Drop a stray closing bracket left inside the price, as the original did.
    building['成交均价'] = match[2].trim().replace(/[)\uFF09]/, '');
  } else {
    building['楼幢名称'] = buildingText;
  }

  // The radio input immediately before the span carries the building id.
  const bid = $(span).prev('input[name="radiobuild"]').attr('bid');
  if (bid) {
    building['bid'] = bid.trim();
  }
  return building;
}

/**
 * Parse the pre-sale licences of a detail page.
 *
 * The `#presellInfo` field holds `;;`-separated items, each a `,,`-separated
 * pair whose second part is the licence number. The buildings of a licence
 * are the `<span>` siblings between its `.three` heading and the next one.
 *
 * @param {Function} $ - Cheerio instance loaded with a detail page.
 * @returns {Array<Object>} One `{ 许可证号, 楼幢 }` record per licence.
 */
function parsePresellLicenses($) {
  const licenses = [];
  const presellInfo = $('#presellInfo').val();
  if (!presellInfo) {
    return licenses;
  }

  for (const item of presellInfo.split(';;')) {
    const parts = item.split(',,');
    if (parts.length !== 2 || !parts[1]) {
      continue;
    }
    const licenseNo = parts[1];

    // Match headings by text instead of building a `:contains('…')` selector
    // from page data, which breaks when the number contains a quote.
    const licenseDiv = $('.three').filter((index, el) => $(el).text().includes(licenseNo));
    const buildingSpans = licenseDiv.nextUntil('.three', 'span').get();

    licenses.push({
      '许可证号': licenseNo.trim(),
      '楼幢': buildingSpans.map((span) => parseBuilding($, span)),
    });
  }
  return licenses;
}

/**
 * Fetch a project's detail page and merge its fields into the record.
 *
 * Records without a `项目链接` are returned untouched. Failures are logged and
 * the record is returned with whatever was collected before the failure.
 *
 * @param {Object} project - List record; mutated in place.
 * @returns {Promise<Object>} The same `project` object.
 */
async function scrapeProjectDetails(project) {
  if (!project || !project['项目链接']) {
    return project;
  }

  try {
    console.log(`正在抓取详情: ${project['项目名称']}`);
    const response = await axios.get(project['项目链接'], {
      headers: { 'User-Agent': USER_AGENT },
      timeout: REQUEST_TIMEOUT_MS,
    });
    const $ = cheerio.load(response.data);

    collectDetailFields($, project);
    // Licences need their own parser because they are not plain table cells.
    project['预售许可证'] = parsePresellLicenses($);
  } catch (error) {
    console.error(`抓取详情失败: ${project['项目名称']}`, error.message);
  }
  return project;
}

/**
 * Read the total page count from the pager.
 * Falls back to 1 when the pager is missing or its text is not a number.
 *
 * @param {Function} $ - Cheerio instance loaded with a list page.
 * @returns {number} Total number of list pages (at least 1).
 */
function parsePageCount($) {
  const pageCountSpan = $('#PageNavigator1_LblPageCount');
  if (!pageCountSpan.length) {
    return 1;
  }
  const parsed = Number.parseInt(pageCountSpan.text(), 10);
  return Number.isNaN(parsed) || parsed < 1 ? 1 : parsed;
}

/**
 * Build the ASP.NET postback body that requests the next list page.
 *
 * @param {Function} $ - Cheerio instance of the page currently displayed.
 * @param {Object} formValues - Search-form fields to resubmit.
 * @param {number} pageNumber - 1-based number of the page being requested.
 * @returns {URLSearchParams|null} The form body, or null when the current
 *   page has no `__VIEWSTATE` to post back.
 */
function buildNextPageForm($, formValues, pageNumber) {
  const viewState = $('#__VIEWSTATE').val();
  if (!viewState) {
    return null;
  }
  const viewStateGenerator = $('#__VIEWSTATEGENERATOR').val();
  const eventValidation = $('#__EVENTVALIDATION').val();

  const postData = new URLSearchParams();
  postData.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext');
  postData.append('__EVENTARGUMENT', '');
  postData.append('__VIEWSTATE', viewState);
  // Only send optional hidden fields that exist; appending an undefined value
  // would post the literal string "undefined".
  if (viewStateGenerator) {
    postData.append('__VIEWSTATEGENERATOR', viewStateGenerator);
  }
  if (eventValidation) {
    postData.append('__EVENTVALIDATION', eventValidation);
  }
  for (const [name, value] of Object.entries(formValues)) {
    postData.append(name, value);
  }
  postData.append('PageNavigator1$txtNewPageIndex', (pageNumber - 1).toString());
  return postData;
}

/**
 * Scrape every list page, following the pager, then fetch each project's
 * detail page.
 *
 * @returns {Promise<Array<Object>>} All scraped project records.
 */
async function scrapeWebsite() {
  let allData = [];
  console.log('开始抓取第一页数据...');
  const origin = new URL(START_URL).origin;

  let response = await axios.get(START_URL, {
    headers: { 'User-Agent': USER_AGENT },
    timeout: REQUEST_TIMEOUT_MS,
  });
  let $ = cheerio.load(response.data);

  const firstPageData = await extractDataFromHtml($, origin);
  allData = allData.concat(firstPageData);
  console.log(`第一页抓取完成,获得 ${allData.length} 条数据。`);

  // Total number of pages.
  const totalPages = parsePageCount($);
  console.log(`共发现 ${totalPages} 页。`);

  // Search-form fields of the first page, resubmitted with every postback.
  const formValues = {};
  $('input[name^="txt"], select').each((idx, el) => {
    const name = $(el).attr('name');
    if (name) {
      formValues[name] = $(el).val() || '';
    }
  });

  // Remaining pages. Each postback needs the hidden state of the page
  // fetched just before it, so these requests must stay sequential.
  for (let i = 2; i <= totalPages; i++) {
    console.log(`正在抓取第 ${i} 页...`);

    const postData = buildNextPageForm($, formValues, i);
    if (!postData) {
      console.log('无法找到 __VIEWSTATE,终止抓取。');
      break;
    }

    response = await axios.post(START_URL, postData, {
      headers: {
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': USER_AGENT,
        'Referer': START_URL,
      },
      timeout: REQUEST_TIMEOUT_MS,
    });
    $ = cheerio.load(response.data);

    const nextPageData = await extractDataFromHtml($, origin);
    console.log(`第 ${i} 页抓取完成,获得 ${nextPageData.length} 条数据。`);
    if (nextPageData.length === 0) {
      console.log(`第 ${i} 页没有数据,抓取结束。`);
      break;
    }
    allData = allData.concat(nextPageData);
  }

  console.log(`\n抓取全站数据完毕!共 ${allData.length} 条原始记录。`);
  console.log('\n开始抓取项目详情...');

  // Bounded concurrency: firing one request per project all at once can
  // overwhelm the server and trigger mass failures.
  const detailedData = await mapWithConcurrency(
    allData,
    MAX_CONCURRENT_REQUESTS,
    (project) => scrapeProjectDetails(project),
  );
  console.log('所有项目详情抓取完毕。');
  return detailedData;
}

/**
 * Post-process the scraped data:
 * 1. Renumber the `序号` field sequentially, starting at "1".
 *
 * @param {Array<Object>} allData - Raw scraped records (mutated in place).
 * @returns {Promise<Array<Object>>} The same array, renumbered.
 */
async function processScrapedData(allData) {
  allData.forEach((record, index) => {
    record['序号'] = (index + 1).toString();
  });
  console.log('序号字段已重新编号。');
  return allData;
}

/**
 * Entry point, exported so it can be called from the project root.
 * Scrapes the site, post-processes the records and writes them to
 * `data.json` next to this file. Errors are logged, not rethrown.
 *
 * @returns {Promise<void>}
 */
export async function main() {
  try {
    const allData = await scrapeWebsite();
    const processedData = await processScrapedData(allData);

    // Save as a JSON file.
    await fs.writeFile(
      path.join(__dirname, 'data.json'),
      JSON.stringify(processedData, null, 4),
      'utf-8',
    );
    console.log('项目数据已保存至 data.json 文件。');
    console.log('\n所有数据抓取和处理任务已完成。');
  } catch (error) {
    console.error('抓取或处理过程中发生错误:', error.message);
    if (error.response) {
      console.error('Status:', error.response.status);
    }
  }
}