puning-real-estate/scripts/getPreSaleLicense.js
2026-01-22 10:24:47 +08:00

175 lines
6.7 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import axios from './axios.js';
import * as cheerio from 'cheerio';
import fs from 'fs/promises';
import path from 'path';
// Root path of the HPMS site that is scraped.
const BASE_URL = 'http://120.236.48.169:89/HPMS/';
// Pre-sale certificate listing page, located directly under BASE_URL.
const START_URL = `${BASE_URL}presellCertList.aspx`;
/**
 * Builds one record per data row of the result table in a loaded page.
 *
 * Rows are taken from `.resultlist table`, keeping only rows that contain
 * `td` cells (which skips the header) and that have at least 9 cells.
 *
 * @param {Function} $ - Cheerio-style selector function bound to the page.
 * @returns {Array<Object>} Records keyed by the Chinese column names, plus
 *   '许可证链接' holding the absolute detail URL ('' when none is found).
 */
function extractDataFromHtml($) {
    // Trimmed text of the cell at `index` within a row's cell collection.
    const cellText = (cells, index) => $(cells[index]).text().trim();
    const records = [];

    $('.resultlist table tr:has(td)').each((_, tableRow) => {
        const cells = $(tableRow).find('td');
        if (cells.length < 9) {
            // Not a full data row; nothing to record.
            return;
        }

        // The licence number (2nd column) is read from the link inside the cell.
        const anchor = $(cells[1]).find('a');
        const record = {
            '序号': cellText(cells, 0),
            '许可证号': anchor.text().trim(),
            '开发企业': cellText(cells, 2),
            '项目名称': cellText(cells, 3),
            '项目地址': cellText(cells, 4),
            '批准时间': cellText(cells, 5),
            '所在区域': cellText(cells, 6),
            '总套数': cellText(cells, 7),
            '可售套数': cellText(cells, 8),
            '许可证链接': '',
        };

        // The target lives in an onclick handler such as
        // open_click('/HPMS/PresellDetailsInfo.aspx?id=1012110'), quoted with
        // either single or double quotes, rather than in an href.
        const handler = anchor.attr('onclick');
        const target = handler ? handler.match(/open_click\(['"]([^'"]+)['"]\)/) : null;
        if (target && target[1]) {
            // The captured path already starts at the site root, so resolve it
            // against the origin only, not against the full BASE_URL path.
            const siteOrigin = new URL(BASE_URL).origin;
            record['许可证链接'] = new URL(target[1], siteOrigin).href;
        }

        records.push(record);
    });

    return records;
}
/**
 * Scrapes every page of the pre-sale certificate list.
 *
 * The first page is fetched with a GET. Each following page is requested by
 * POSTing the form back to the same URL with the "next page" link button as
 * the event target, carrying over the hidden state fields (`__VIEWSTATE`
 * etc.) read from the page that was loaded last.
 *
 * Scraping stops early when a page has no `__VIEWSTATE` field or yields no
 * records. Network errors from axios are not caught here.
 *
 * @returns {Promise<Array<Object>>} All scraped records, in page order.
 */
async function scrapeWebsite() {
    console.log('开始抓取第一页数据...');
    let response = await axios.get(START_URL);
    let $ = cheerio.load(response.data);
    const allData = extractDataFromHtml($);
    console.log(`第一页抓取完成,获得 ${allData.length} 条数据。`);

    const pageCountSpan = $('#PageNavigator1_LblPageCount');
    const parsedPageCount = pageCountSpan.length
        ? Number.parseInt(pageCountSpan.text(), 10)
        : 1;
    // A page-count label that does not start with a number parses to NaN;
    // treat that as a single page instead of reporting "NaN" pages.
    const totalPages = Number.isNaN(parsedPageCount) ? 1 : parsedPageCount;
    console.log(`共发现 ${totalPages} 页。`);

    // Snapshot the search form (inputs whose name starts with "txt", and all
    // selects) from the first page so every paging request re-sends it.
    const formValues = {};
    $('input[name^="txt"], select').each((idx, el) => {
        const name = $(el).attr('name');
        if (name) {
            formValues[name] = $(el).val() || '';
        }
    });

    for (let i = 2; i <= totalPages; i++) {
        console.log(`正在抓取第 ${i} 页...`);

        // Hidden state fields come from the most recently loaded page.
        const viewState = $('#__VIEWSTATE').val();
        const viewStateGenerator = $('#__VIEWSTATEGENERATOR').val();
        const eventValidation = $('#__EVENTVALIDATION').val();
        if (!viewState) {
            console.log('无法找到 __VIEWSTATE终止抓取。');
            break;
        }

        const postData = new URLSearchParams();
        postData.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext');
        postData.append('__EVENTARGUMENT', '');
        postData.append('__VIEWSTATE', viewState);
        // URLSearchParams stringifies its values, so appending a missing
        // field would post the literal text "undefined"; omit it instead.
        if (viewStateGenerator) {
            postData.append('__VIEWSTATEGENERATOR', viewStateGenerator);
        }
        if (eventValidation) {
            postData.append('__EVENTVALIDATION', eventValidation);
        }
        for (const [name, value] of Object.entries(formValues)) {
            postData.append(name, value);
        }
        // Sent as the number of the page currently loaded (i - 1).
        // NOTE(review): presumably mirrors what the pager's index box holds
        // when "next" is clicked — confirm against the live form.
        postData.append('PageNavigator1$txtNewPageIndex', String(i - 1));

        response = await axios.post(START_URL, postData, {
            headers: {
                'Content-Type': 'application/x-www-form-urlencoded',
                'Referer': START_URL,
            },
        });
        $ = cheerio.load(response.data);

        const nextPageData = extractDataFromHtml($);
        console.log(`${i} 页抓取完成,获得 ${nextPageData.length} 条数据。`);
        if (nextPageData.length === 0) {
            console.log(`${i} 页没有数据,抓取结束。`);
            break;
        }
        // Append in place rather than re-copying the accumulated array each page.
        allData.push(...nextPageData);
    }

    console.log(`\n抓取全站数据完毕!共 ${allData.length} 条原始记录。`);
    return allData;
}
/**
 * Post-processes the scraped records and returns new record objects; neither
 * the input array nor the objects inside it are modified.
 *
 * 1. Drops records whose "许可证号" is exactly "空".
 * 2. Renumbers "序号" sequentially from "1" (kept as a string).
 * 3. Converts "总套数" and "可售套数" to integers; values that do not parse
 *    become 0.
 *
 * @param {Array<Object>} allData - The raw scraped data.
 * @returns {Promise<Array<Object>>} The processed records, in input order.
 */
async function processScrapedData(allData) {
    // 1. Filter out records where "许可证号" is "空"
    const keptRecords = allData.filter((record) => record['许可证号'] !== '空');
    console.log(`删除 "许可证号" 为 "空" 的记录后,剩余 ${keptRecords.length} 条记录。`);

    // 2 + 3. Build a fresh object per record so the caller's data stays
    // untouched. Overriding keys that already exist keeps their position, so
    // the key order of the serialized output does not change.
    const processedData = keptRecords.map((record, index) => ({
        ...record,
        '序号': String(index + 1),
        '总套数': Number.parseInt(record['总套数'], 10) || 0,
        '可售套数': Number.parseInt(record['可售套数'], 10) || 0,
    }));
    console.log('序号字段已重新编号。');
    console.log('"总套数" 和 "可售套数" 字段已转换为数字。');

    return processedData;
}
/**
 * Main entry point, exported so it can be called from the project root.
 *
 * Scrapes the site, post-processes the records and writes them as JSON
 * (4-space indent, UTF-8) to `../data/preSaleLicense.json` relative to this
 * module, creating the `data` directory when it does not exist yet.
 *
 * Errors are logged and not rethrown, so the returned promise resolves even
 * when scraping or writing fails.
 *
 * @returns {Promise<void>}
 */
export async function main() {
    try {
        const allData = await scrapeWebsite();
        const processedData = await processScrapedData(allData);

        // Hand file URLs straight to fs instead of taking `.pathname` from
        // import.meta.url: the pathname is percent-encoded (spaces, non-ASCII
        // directory names) and has a leading slash before the drive letter
        // on Windows, so it is not a usable filesystem path.
        const dataDirUrl = new URL('../data/', import.meta.url);
        const dataFileUrl = new URL('preSaleLicense.json', dataDirUrl);
        await fs.mkdir(dataDirUrl, { recursive: true });
        await fs.writeFile(dataFileUrl, JSON.stringify(processedData, null, 4), 'utf-8');

        console.log(`更新后的数据已保存至 data/preSaleLicense.json 文件。`);
        console.log('\n所有数据处理和文件生成任务已完成。');
    } catch (error) {
        console.error('抓取或处理过程中发生错误:', error.message);
        // `response` is only present on errors that carry an HTTP response.
        if (error.response) {
            console.error('Status:', error.response.status);
        }
    }
}