puning-real-estate/scraper.js
2026-01-18 14:09:29 +08:00

139 lines
5.7 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import axios from 'axios';
import * as cheerio from 'cheerio';
import fs from 'fs/promises';
// Root of the HPMS site; the pages requested by this scraper live under it.
const BASE_URL = 'http://120.236.48.169:89/HPMS/';
// Certificate listing page: fetched first with GET, then used as the target of the paging POSTs.
const START_URL = new URL('presellCertList.aspx', BASE_URL).href;
/**
 * Collects the certificate rows from a parsed listing page.
 *
 * Rows are taken from the table inside `.resultlist`; only rows that contain
 * `<td>` cells are considered (which skips the header row), and rows with
 * fewer than 9 cells are ignored.
 *
 * @param {Function} $ - Cheerio instance loaded with the listing page HTML.
 * @returns {Array<Object>} One record per data row. All values are trimmed
 *   strings; '许可证链接' is an absolute URL, or '' when the licence anchor
 *   carries no recognisable `open_click(...)` handler.
 */
function extractDataFromHtml($) {
  const MIN_COLUMNS = 9;
  // The detail page path is passed to open_click(...) in single or double quotes.
  const OPEN_CLICK_PATTERN = /open_click\(['"]([^'"]+)['"]\)/;

  // Trimmed text content of a single table cell.
  const cellText = (cell) => $(cell).text().trim();

  // Converts an onclick handler into the absolute detail-page URL ('' if none).
  const resolveDetailLink = (onclick) => {
    if (!onclick) {
      return '';
    }
    const found = onclick.match(OPEN_CLICK_PATTERN);
    if (!found || !found[1]) {
      return '';
    }
    // The captured path (e.g. '/HPMS/PresellDetailsInfo.aspx?id=1012110') is
    // resolved against the site origin rather than the full BASE_URL path.
    const siteOrigin = new URL(BASE_URL).origin;
    return new URL(found[1], siteOrigin).href;
  };

  const records = [];
  $('.resultlist table tr:has(td)').each((_index, tr) => {
    const cells = $(tr).find('td');
    if (cells.length < MIN_COLUMNS) {
      return;
    }

    // The licence number and its detail link both come from the anchor in
    // the second column, not from a regular href.
    const licenseAnchor = $(cells[1]).find('a');

    records.push({
      '序号': cellText(cells[0]),
      '许可证号': licenseAnchor.text().trim(),
      '开发企业': cellText(cells[2]),
      '项目名称': cellText(cells[3]),
      '项目地址': cellText(cells[4]),
      '批准时间': cellText(cells[5]),
      '所在区域': cellText(cells[6]),
      '总套数': cellText(cells[7]),
      '可售套数': cellText(cells[8]),
      '许可证链接': resolveDetailLink(licenseAnchor.attr('onclick')),
    });
  });
  return records;
}
/**
 * Scrapes every page of the certificate listing and writes the combined rows
 * to data.json in the current working directory.
 *
 * The first page is fetched with GET. Each following page is requested by
 * POSTing back to the same URL with the hidden ASP.NET state fields
 * (__VIEWSTATE etc.) read from the most recently received page, targeting the
 * "next page" link button. Paging stops early if a page has no __VIEWSTATE or
 * yields no rows; whatever has been collected so far is still written.
 *
 * Errors are logged rather than rethrown, and the process exit code is set to
 * 1 so callers (shell scripts, schedulers) can detect the failure.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // The same browser-like User-Agent is sent with every request.
  const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';

  try {
    console.log('开始抓取第一页数据...');
    let response = await axios.get(START_URL, {
      headers: {
        'User-Agent': userAgent,
      },
    });
    let $ = cheerio.load(response.data);
    const allData = extractDataFromHtml($);
    console.log(`第一页抓取完成,获得 ${allData.length} 条数据。`);

    // Read the total page count from the pager label. If the label is missing
    // or does not start with a positive integer, treat the listing as a
    // single page instead of carrying NaN into the loop bound and the log.
    const pageCountSpan = $('#PageNavigator1_LblPageCount');
    const parsedPageCount = pageCountSpan.length ? parseInt(pageCountSpan.text(), 10) : 1;
    const totalPages = Number.isInteger(parsedPageCount) && parsedPageCount > 0 ? parsedPageCount : 1;
    console.log(`共发现 ${totalPages} 页。`);

    // Capture the search form values of the first page so every paging
    // request repeats the same (default) query.
    const formValues = {};
    $('input[name^="txt"], select').each((idx, el) => {
      const name = $(el).attr('name');
      if (name) {
        formValues[name] = $(el).val() || '';
      }
    });

    // Walk from the second page to the last one.
    for (let i = 2; i <= totalPages; i++) {
      console.log(`正在抓取第 ${i} 页...`);

      // The state fields must come from the page received last, not the first one.
      const viewState = $('#__VIEWSTATE').val();
      const viewStateGenerator = $('#__VIEWSTATEGENERATOR').val();
      const eventValidation = $('#__EVENTVALIDATION').val();
      if (!viewState) {
        console.log('无法找到 __VIEWSTATE终止抓取。');
        break;
      }

      const postData = new URLSearchParams();
      postData.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext');
      postData.append('__EVENTARGUMENT', '');
      postData.append('__VIEWSTATE', viewState);
      // URLSearchParams stringifies its values, so a missing field would be
      // posted as the literal text "undefined"; only send fields that exist.
      if (viewStateGenerator) {
        postData.append('__VIEWSTATEGENERATOR', viewStateGenerator);
      }
      if (eventValidation) {
        postData.append('__EVENTVALIDATION', eventValidation);
      }
      for (const [name, value] of Object.entries(formValues)) {
        postData.append(name, value);
      }
      // Sends i - 1, i.e. the number of the page received last.
      // NOTE(review): presumably the pager expects the current page here — confirm against the site.
      postData.append('PageNavigator1$txtNewPageIndex', String(i - 1));

      response = await axios.post(START_URL, postData, {
        headers: {
          'Content-Type': 'application/x-www-form-urlencoded',
          'User-Agent': userAgent,
          'Referer': START_URL,
        },
      });
      $ = cheerio.load(response.data);

      const nextPageData = extractDataFromHtml($);
      console.log(`${i} 页抓取完成,获得 ${nextPageData.length} 条数据。`);
      if (nextPageData.length === 0) {
        console.log(`${i} 页没有数据,抓取结束。`);
        break;
      }
      allData.push(...nextPageData);
    }

    await fs.writeFile('data.json', JSON.stringify(allData, null, 4), 'utf-8');
    console.log(`\n抓取全站数据完毕!共 ${allData.length} 条记录已保存至 data.json 文件。`);
  } catch (error) {
    console.error('抓取过程中发生错误:', error.message);
    if (error.response) {
      console.error('Status:', error.response.status);
    }
    // Report the failure through the exit status without cutting off pending output.
    process.exitCode = 1;
  }
}
// Script entry point: start the scrape when the module is run. The returned
// promise is not awaited here; main() wraps its whole body in try/catch.
main();