puning-real-estate/PreSaleLicense/index.js
2026-01-21 17:39:54 +08:00

252 lines
9.1 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import axios from 'axios';
import * as cheerio from 'cheerio';
import fs from 'fs/promises';
import ExcelJS from 'exceljs'; // Import ExcelJS for XLSX generation
// Base URL of the target site; the listing page URL below is built from it.
const BASE_URL = 'http://120.236.48.169:89/HPMS/';
// Listing page requested first and used as the postback target for later pages.
const START_URL = `${BASE_URL}presellCertList.aspx`;
/**
 * Collects one record per result row from a loaded listing page.
 *
 * Rows are read from `.resultlist table`; only rows that contain `<td>` cells
 * are considered, and rows with fewer than nine cells are skipped.
 *
 * @param {Function} $ - Cheerio instance loaded with the page HTML.
 * @returns {Array<Object>} Records keyed by the column labels, plus
 *   `许可证链接` holding the absolute detail-page URL ('' when none is found).
 */
function extractDataFromHtml($) {
    // The detail URL sits inside onclick="open_click('...')" rather than an
    // href; either quote style is accepted.
    const openClickPattern = /open_click\(['"]([^'"]+)['"]\)/;
    const records = [];

    $('.resultlist table tr:has(td)').each((_, tr) => {
        const cells = $(tr).find('td');
        if (cells.length < 9) {
            return;
        }

        const cellText = (position) => $(cells[position]).text().trim();
        // The licence number (2nd column) is read from the anchor inside the cell.
        const anchor = $(cells[1]).find('a');
        const captured = (anchor.attr('onclick') || '').match(openClickPattern);

        records.push({
            '序号': cellText(0),
            '许可证号': anchor.text().trim(),
            '开发企业': cellText(2),
            '项目名称': cellText(3),
            '项目地址': cellText(4),
            '批准时间': cellText(5),
            '所在区域': cellText(6),
            '总套数': cellText(7),
            '可售套数': cellText(8),
            // Captured paths look like /HPMS/PresellDetailsInfo.aspx?id=...,
            // so they are resolved against the site origin, not the full
            // BASE_URL path.
            '许可证链接': captured
                ? new URL(captured[1], new URL(BASE_URL).origin).href
                : '',
        });
    });

    return records;
}
/**
 * Scrapes all data from the target website, handling pagination.
 *
 * The first page is fetched with a GET. Every further page is requested by
 * POSTing the form back to the same URL with `PageNavigator1$LnkBtnNext` as
 * the event target, using the hidden state fields (`__VIEWSTATE` etc.) read
 * from the most recently loaded page. Paging stops early when `__VIEWSTATE`
 * is missing or a page yields no records.
 *
 * @returns {Promise<Array<Object>>} - An array of all scraped data records.
 */
async function scrapeWebsite() {
    const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';

    console.log('开始抓取第一页数据...');
    let response = await axios.get(START_URL, {
        headers: { 'User-Agent': userAgent },
    });
    let $ = cheerio.load(response.data);
    const allData = extractDataFromHtml($);
    console.log(`第一页抓取完成,获得 ${allData.length} 条数据。`);

    const pageCountSpan = $('#PageNavigator1_LblPageCount');
    const parsedPageCount = pageCountSpan.length ? parseInt(pageCountSpan.text(), 10) : 1;
    // An empty or non-numeric counter would otherwise surface as NaN; treat it
    // as a single page.
    const totalPages = Number.isInteger(parsedPageCount) && parsedPageCount > 0
        ? parsedPageCount
        : 1;
    console.log(`共发现 ${totalPages} 页。`);

    // Search-form fields are captured once from the first page and replayed
    // unchanged on every postback.
    const formValues = {};
    $('input[name^="txt"], select').each((idx, el) => {
        const name = $(el).attr('name');
        if (name) {
            formValues[name] = $(el).val() || '';
        }
    });

    for (let i = 2; i <= totalPages; i++) {
        console.log(`正在抓取第 ${i} 页...`);

        // Hidden state fields are re-read from the page loaded last.
        const viewState = $('#__VIEWSTATE').val();
        if (!viewState) {
            console.log('无法找到 __VIEWSTATE终止抓取。');
            break;
        }

        const postData = new URLSearchParams();
        postData.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext');
        postData.append('__EVENTARGUMENT', '');
        postData.append('__VIEWSTATE', viewState);

        // URLSearchParams.append() stringifies its value, so a missing hidden
        // field would be posted as the literal text "undefined". Send these
        // fields only when the page actually provides them.
        const optionalStateFields = [
            ['__VIEWSTATEGENERATOR', $('#__VIEWSTATEGENERATOR').val()],
            ['__EVENTVALIDATION', $('#__EVENTVALIDATION').val()],
        ];
        for (const [name, value] of optionalStateFields) {
            if (value) {
                postData.append(name, value);
            }
        }

        for (const [name, value] of Object.entries(formValues)) {
            postData.append(name, value);
        }
        // Index of the page we are currently on (the one before page i).
        postData.append('PageNavigator1$txtNewPageIndex', (i - 1).toString());

        response = await axios.post(START_URL, postData, {
            headers: {
                'Content-Type': 'application/x-www-form-urlencoded',
                'User-Agent': userAgent,
                'Referer': START_URL,
            },
        });
        $ = cheerio.load(response.data);

        const nextPageData = extractDataFromHtml($);
        console.log(`${i} 页抓取完成,获得 ${nextPageData.length} 条数据。`);
        if (nextPageData.length === 0) {
            console.log(`${i} 页没有数据,抓取结束。`);
            break;
        }
        allData.push(...nextPageData);
    }

    console.log(`\n抓取全站数据完毕!共 ${allData.length} 条原始记录。`);
    return allData;
}
/**
 * Performs post-processing on the scraped data:
 * 1. Filters out records where "许可证号" is "空".
 * 2. Re-indexes "序号" sequentially (1-based, stored as a string).
 * 3. Converts "总套数" and "可售套数" fields to numbers (0 when not parseable).
 *
 * The input array and its record objects are left untouched; the result is
 * built from shallow copies.
 *
 * @param {Array<Object>} allData - The raw scraped data.
 * @returns {Promise<Array<Object>>} - The processed data.
 */
async function processScrapedData(allData) {
    // 1. Drop records whose licence number is the placeholder "空".
    const keptRecords = allData.filter((record) => record['许可证号'] !== '空');
    console.log(`删除 "许可证号" 为 "空" 的记录后,剩余 ${keptRecords.length} 条记录。`);

    // 2 + 3. Build new records instead of mutating the caller's objects.
    // Overriding existing keys in the spread keeps the original key order,
    // which the CSV/XLSX writers use for their column order.
    const processedData = keptRecords.map((record, index) => ({
        ...record,
        '序号': (index + 1).toString(),
        '总套数': parseInt(record['总套数'], 10) || 0,
        '可售套数': parseInt(record['可售套数'], 10) || 0,
    }));
    console.log('序号字段已重新编号。');
    console.log('"总套数" 和 "可售套数" 字段已转换为数字。');

    return processedData;
}
/**
 * Generates a CSV file from the given data.
 *
 * Columns are taken from the keys of the first record. Every cell (header
 * cells included) is wrapped in double quotes with embedded quotes doubled.
 * `null` and `undefined` values are written as empty cells. The file is
 * UTF-8 with a leading BOM and "\n" line separators. Nothing is written when
 * `data` is empty.
 *
 * @param {Array<Object>} data - The data to convert to CSV.
 * @param {string} filePath - The path to save the CSV file.
 * @returns {Promise<void>}
 */
async function generateCsv(data, filePath) {
    if (data.length === 0) {
        console.log(`没有数据可生成 CSV 文件 (${filePath})。`);
        return;
    }

    // `?? ''` keeps missing values from showing up as the text "null" or
    // "undefined" in the output.
    const quoteCell = (value) => `"${String(value ?? '').replace(/"/g, '""')}"`;

    const headers = Object.keys(data[0]);
    const lines = [
        headers.map(quoteCell).join(','),
        ...data.map((record) => headers.map((header) => quoteCell(record[header])).join(',')),
    ];

    const BOM = '\uFEFF'; // UTF-8 BOM
    await fs.writeFile(filePath, BOM + lines.join('\n'), 'utf-8');
    console.log(`已生成 ${filePath} 文件。`);
}
/**
 * Writes the records to an XLSX workbook with a single sheet named "Data".
 *
 * Columns follow the key order of the first record. The header row is bold,
 * centred and grey-filled, and an auto-filter spans all header cells.
 * Nothing is written when `data` is empty.
 *
 * @param {Array<Object>} data - The data to convert to XLSX.
 * @param {string} filePath - The path to save the XLSX file.
 */
async function generateXlsx(data, filePath) {
    if (data.length === 0) {
        console.log(`没有数据可生成 XLSX 文件 (${filePath})。`);
        return;
    }

    const columnKeys = Object.keys(data[0]);
    // Address and link columns get a wider cell than the rest.
    const isWideColumn = (name) => name.includes('地址') || name.includes('链接');

    const book = new ExcelJS.Workbook();
    const sheet = book.addWorksheet('Data');

    sheet.columns = columnKeys.map((name) => ({
        header: name,
        key: name,
        width: isWideColumn(name) ? 40 : 20,
    }));
    sheet.addRows(data);

    // Each header cell receives its own style objects.
    const headerRow = sheet.getRow(1);
    headerRow.eachCell((headerCell) => {
        headerCell.font = { bold: true };
        headerCell.fill = {
            type: 'pattern',
            pattern: 'solid',
            fgColor: { argb: 'FFDDDDDD' },
        };
        headerCell.alignment = { vertical: 'middle', horizontal: 'center' };
    });

    const lastHeaderCell = { row: 1, column: columnKeys.length };
    sheet.autoFilter = { from: 'A1', to: lastHeaderCell };

    await book.xlsx.writeFile(filePath);
    console.log(`已生成 ${filePath} 文件。`);
}
/**
 * Script entry point: scrapes the site, post-processes the records and writes
 * them to data.json, ./data.csv and ./data.xlsx.
 *
 * Any failure is logged (with the HTTP status when the error carries a
 * response) and the process exit code is set to 1 so that the invoking
 * shell or scheduler can tell the run did not succeed.
 */
async function main() {
    try {
        const allData = await scrapeWebsite();
        const processedData = await processScrapedData(allData);

        await fs.writeFile('data.json', JSON.stringify(processedData, null, 4), 'utf-8');
        console.log(`更新后的数据已保存至 data.json 文件。`);

        await generateCsv(processedData, './data.csv');
        await generateXlsx(processedData, './data.xlsx');
        console.log('\n所有数据处理和文件生成任务已完成。');
    } catch (error) {
        // `?.` keeps the handler itself from throwing on a non-Error throwable.
        console.error('抓取或处理过程中发生错误:', error?.message ?? error);
        if (error?.response) {
            console.error('Status:', error.response.status);
        }
        // Without this the process would still exit with status 0 after a
        // failed run. Setting exitCode (rather than calling process.exit)
        // lets pending output flush first.
        process.exitCode = 1;
    }
}
main();