// HPMS presell-certificate scraper: crawls the paginated listing page,
// cleans the scraped records, and exports them as JSON, CSV, and XLSX.
import axios from 'axios';
|
||
import * as cheerio from 'cheerio';
|
||
import fs from 'fs/promises';
|
||
import ExcelJS from 'exceljs'; // Import ExcelJS for XLSX generation
|
||
|
||
const BASE_URL = 'http://120.236.48.169:89/HPMS/';
|
||
const START_URL = BASE_URL + 'presellCertList.aspx';
|
||
|
||
// Extracts table data from a given HTML content
|
||
function extractDataFromHtml($) {
|
||
const data = [];
|
||
// Corrected selector to find rows with table data, skipping the header
|
||
const rows = $('.resultlist table tr:has(td)');
|
||
|
||
rows.each((i, row) => {
|
||
const columns = $(row).find('td');
|
||
// Based on debug.html, the structure is different and has 9 columns
|
||
if (columns.length >= 9) {
|
||
const licenseCell = $(columns[1]); // 许可证号 is the 2nd column
|
||
const licenseLinkTag = licenseCell.find('a');
|
||
|
||
const rowData = {
|
||
'序号': $(columns[0]).text().trim(),
|
||
'许可证号': licenseLinkTag.text().trim(),
|
||
'开发企业': $(columns[2]).text().trim(),
|
||
'项目名称': $(columns[3]).text().trim(),
|
||
'项目地址': $(columns[4]).text().trim(),
|
||
'批准时间': $(columns[5]).text().trim(),
|
||
'所在区域': $(columns[6]).text().trim(),
|
||
'总套数': $(columns[7]).text().trim(),
|
||
'可售套数': $(columns[8]).text().trim(),
|
||
'许可证链接': '', // Initialize
|
||
};
|
||
|
||
// The link is inside an onclick attribute, not a standard href
|
||
const onclickAttr = licenseLinkTag.attr('onclick');
|
||
if (onclickAttr) {
|
||
// Make regex flexible to handle single or double quotes
|
||
const match = onclickAttr.match(/open_click\(['"]([^'"]+)['"]\)/);
|
||
if (match && match[1]) {
|
||
// match[1] will be like '/HPMS/PresellDetailsInfo.aspx?id=1012110'
|
||
// We need to resolve it against the origin, not the full BASE_URL path
|
||
const origin = new URL(BASE_URL).origin;
|
||
rowData['许可证链接'] = new URL(match[1], origin).href;
|
||
}
|
||
}
|
||
data.push(rowData);
|
||
}
|
||
});
|
||
return data;
|
||
}
|
||
|
||
/**
|
||
* Scrapes all data from the target website, handling pagination.
|
||
* @returns {Promise<Array<Object>>} - An array of all scraped data records.
|
||
*/
|
||
async function scrapeWebsite() {
|
||
let allData = [];
|
||
console.log('开始抓取第一页数据...');
|
||
let response = await axios.get(START_URL, {
|
||
headers: {
|
||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||
}
|
||
});
|
||
|
||
let $ = cheerio.load(response.data);
|
||
allData = extractDataFromHtml($);
|
||
console.log(`第一页抓取完成,获得 ${allData.length} 条数据。`);
|
||
|
||
const pageCountSpan = $('#PageNavigator1_LblPageCount');
|
||
const totalPages = pageCountSpan.length ? parseInt(pageCountSpan.text(), 10) : 1;
|
||
console.log(`共发现 ${totalPages} 页。`);
|
||
|
||
const formValues = {};
|
||
$('input[name^="txt"], select').each((idx, el) => {
|
||
const name = $(el).attr('name');
|
||
if (name) {
|
||
formValues[name] = $(el).val() || '';
|
||
}
|
||
});
|
||
|
||
for (let i = 2; i <= totalPages; i++) {
|
||
console.log(`正在抓取第 ${i} 页...`);
|
||
|
||
const viewState = $('#__VIEWSTATE').val();
|
||
const viewStateGenerator = $('#__VIEWSTATEGENERATOR').val();
|
||
const eventValidation = $('#__EVENTVALIDATION').val();
|
||
|
||
if (!viewState) {
|
||
console.log('无法找到 __VIEWSTATE,终止抓取。');
|
||
break;
|
||
}
|
||
|
||
const postData = new URLSearchParams();
|
||
postData.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext');
|
||
postData.append('__EVENTARGUMENT', '');
|
||
postData.append('__VIEWSTATE', viewState);
|
||
postData.append('__VIEWSTATEGENERATOR', viewStateGenerator);
|
||
if(eventValidation) {
|
||
postData.append('__EVENTVALIDATION', eventValidation);
|
||
}
|
||
|
||
for (const name in formValues) {
|
||
postData.append(name, formValues[name]);
|
||
}
|
||
postData.append('PageNavigator1$txtNewPageIndex', (i - 1).toString());
|
||
|
||
response = await axios.post(START_URL, postData, {
|
||
headers: {
|
||
'Content-Type': 'application/x-www-form-urlencoded',
|
||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
||
'Referer': START_URL,
|
||
}
|
||
});
|
||
|
||
$ = cheerio.load(response.data);
|
||
const nextPageData = extractDataFromHtml($);
|
||
console.log(`第 ${i} 页抓取完成,获得 ${nextPageData.length} 条数据。`);
|
||
|
||
if (nextPageData.length === 0) {
|
||
console.log(`第 ${i} 页没有数据,抓取结束。`);
|
||
break;
|
||
}
|
||
allData = allData.concat(nextPageData);
|
||
}
|
||
console.log(`\n抓取全站数据完毕!共 ${allData.length} 条原始记录。`);
|
||
return allData;
|
||
}
|
||
|
||
|
||
/**
|
||
* Performs post-processing on the scraped data:
|
||
* 1. Filters out records where "许可证号" is "空".
|
||
* 2. Re-indexes "序号" sequentially.
|
||
* 3. Converts "总套数" and "可售套数" fields to numbers.
|
||
* @param {Array<Object>} allData - The raw scraped data.
|
||
* @returns {Promise<Array<Object>>} - The processed data.
|
||
*/
|
||
async function processScrapedData(allData) {
|
||
// 1. Filter out records where "许可证号" is "空"
|
||
let processedData = allData.filter(record => record['许可证号'] !== '空');
|
||
console.log(`删除 "许可证号" 为 "空" 的记录后,剩余 ${processedData.length} 条记录。`);
|
||
|
||
// 2. Re-index "序号" sequentially
|
||
for (let i = 0; i < processedData.length; i++) {
|
||
processedData[i]['序号'] = (i + 1).toString();
|
||
}
|
||
console.log('序号字段已重新编号。');
|
||
|
||
// 3. Convert "总套数" and "可售套数" to numbers
|
||
for (const record of processedData) {
|
||
record['总套数'] = parseInt(record['总套数'], 10) || 0;
|
||
record['可售套数'] = parseInt(record['可售套数'], 10) || 0;
|
||
}
|
||
console.log('"总套数" 和 "可售套数" 字段已转换为数字。');
|
||
|
||
return processedData;
|
||
}
|
||
|
||
/**
|
||
* Generates a CSV file from the given data.
|
||
* @param {Array<Object>} data - The data to convert to CSV.
|
||
* @param {string} filePath - The path to save the CSV file.
|
||
*/
|
||
async function generateCsv(data, filePath) {
|
||
if (data.length === 0) {
|
||
console.log(`没有数据可生成 CSV 文件 (${filePath})。`);
|
||
return;
|
||
}
|
||
const headers = Object.keys(data[0]);
|
||
const csvRows = [];
|
||
csvRows.push(headers.map(header => `"${header}"`).join(','));
|
||
for (const record of data) {
|
||
const values = headers.map(header => {
|
||
const value = record[header];
|
||
// Ensure values are properly quoted and internal quotes are escaped
|
||
return `"${String(value).replace(new RegExp('"', 'g'), '""')}"`;
|
||
});
|
||
csvRows.push(values.join(','));
|
||
}
|
||
const csvContent = csvRows.join('\n');
|
||
const BOM = '\uFEFF'; // UTF-8 BOM
|
||
await fs.writeFile(filePath, BOM + csvContent, 'utf-8');
|
||
console.log(`已生成 ${filePath} 文件。`);
|
||
}
|
||
|
||
/**
|
||
* Generates an XLSX file from the given data.
|
||
* @param {Array<Object>} data - The data to convert to XLSX.
|
||
* @param {string} filePath - The path to save the XLSX file.
|
||
*/
|
||
async function generateXlsx(data, filePath) {
|
||
if (data.length === 0) {
|
||
console.log(`没有数据可生成 XLSX 文件 (${filePath})。`);
|
||
return;
|
||
}
|
||
const workbook = new ExcelJS.Workbook();
|
||
const worksheet = workbook.addWorksheet('Data');
|
||
|
||
const headers = Object.keys(data[0]);
|
||
worksheet.columns = headers.map(key => ({
|
||
header: key,
|
||
key: key,
|
||
width: key.includes('地址') || key.includes('链接') ? 40 : 20
|
||
}));
|
||
worksheet.addRows(data);
|
||
|
||
worksheet.getRow(1).eachCell(cell => {
|
||
cell.font = { bold: true };
|
||
cell.fill = {
|
||
type: 'pattern',
|
||
pattern:'solid',
|
||
fgColor:{argb:'FFDDDDDD'}
|
||
};
|
||
cell.alignment = { vertical: 'middle', horizontal: 'center' };
|
||
});
|
||
worksheet.autoFilter = {
|
||
from: 'A1',
|
||
to: {
|
||
row: 1,
|
||
column: headers.length
|
||
}
|
||
};
|
||
await workbook.xlsx.writeFile(filePath);
|
||
console.log(`已生成 ${filePath} 文件。`);
|
||
}
|
||
|
||
|
||
async function main() {
|
||
try {
|
||
const allData = await scrapeWebsite();
|
||
|
||
const processedData = await processScrapedData(allData);
|
||
|
||
await fs.writeFile('data.json', JSON.stringify(processedData, null, 4), 'utf-8');
|
||
console.log(`更新后的数据已保存至 data.json 文件。`);
|
||
|
||
await generateCsv(processedData, './data.csv');
|
||
await generateXlsx(processedData, './data.xlsx');
|
||
|
||
console.log('\n所有数据处理和文件生成任务已完成。');
|
||
|
||
} catch (error) {
|
||
console.error('抓取或处理过程中发生错误:', error.message);
|
||
if (error.response) {
|
||
console.error('Status:', error.response.status);
|
||
}
|
||
}
|
||
}
|
||
|
||
main();
|