import axios from 'axios';
import * as cheerio from 'cheerio';
import fs from 'fs/promises';
import ExcelJS from 'exceljs'; // Import ExcelJS for XLSX generation

const BASE_URL = 'http://120.236.48.169:89/HPMS/';
const START_URL = BASE_URL + 'presellCertList.aspx';

// Browser-like User-Agent sent with every request (GET and POST alike).
const USER_AGENT =
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';

// Matches open_click('<target>') or open_click("<target>") inside an onclick
// attribute; capture group 1 is the link target.
const OPEN_CLICK_PATTERN = /open_click\(['"]([^'"]+)['"]\)/;

/**
 * Extracts the result-table rows from a parsed HTML document.
 *
 * @param {Function} $ - The cheerio instance returned by `cheerio.load(html)`.
 * @returns {Array<Object>} One record per table row that has at least nine
 *   cells. All cell values are trimmed strings; '许可证链接' is an absolute URL,
 *   or '' when the row's link has no parsable onclick target.
 */
function extractDataFromHtml($) {
  const data = [];
  // Only rows that contain <td> cells are data rows; this skips the header.
  const rows = $('.resultlist table tr:has(td)');

  rows.each((i, row) => {
    const columns = $(row).find('td');
    // Data rows carry 9 columns; shorter rows are not records and are skipped.
    if (columns.length < 9) {
      return;
    }

    const cellText = (index) => $(columns[index]).text().trim();
    // The licence number (2nd column) is rendered as an <a> element.
    const licenseLinkTag = $(columns[1]).find('a');

    const rowData = {
      '序号': cellText(0),
      '许可证号': licenseLinkTag.text().trim(),
      '开发企业': cellText(2),
      '项目名称': cellText(3),
      '项目地址': cellText(4),
      '批准时间': cellText(5),
      '所在区域': cellText(6),
      '总套数': cellText(7),
      '可售套数': cellText(8),
      '许可证链接': '', // Filled in below when a link target can be extracted
    };

    // The link lives in an onclick attribute, not in a standard href.
    const onclickAttr = licenseLinkTag.attr('onclick');
    const match = onclickAttr ? onclickAttr.match(OPEN_CLICK_PATTERN) : null;
    if (match && match[1]) {
      // Resolve against BASE_URL: an absolute path such as
      // '/HPMS/PresellDetailsInfo.aspx?id=1012110' resolves exactly as it
      // would against the bare origin, while a relative target keeps the
      // '/HPMS/' directory instead of being placed at the server root.
      rowData['许可证链接'] = new URL(match[1], BASE_URL).href;
    }

    data.push(rowData);
  });

  return data;
}

/**
 * Scrapes all data from the target website, handling pagination.
 *
 * Fetches START_URL with a GET, reads the page count from
 * `#PageNavigator1_LblPageCount`, then requests each further page by POSTing
 * the form state of the most recently loaded page back to START_URL with
 * `PageNavigator1$LnkBtnNext` as the event target. Paging stops early when a
 * response has no `__VIEWSTATE` field or yields no rows.
 *
 * @returns {Promise<Array<Object>>} An array of all scraped data records.
 * @throws Any error raised by axios for a failed request propagates to the caller.
 */
async function scrapeWebsite() {
  const allData = [];

  console.log('开始抓取第一页数据...');
  let response = await axios.get(START_URL, {
    headers: { 'User-Agent': USER_AGENT },
  });
  let $ = cheerio.load(response.data);

  allData.push(...extractDataFromHtml($));
  console.log(`第一页抓取完成,获得 ${allData.length} 条数据。`);

  const pageCountSpan = $('#PageNavigator1_LblPageCount');
  const parsedPageCount = pageCountSpan.length ? parseInt(pageCountSpan.text(), 10) : 1;
  // A label that exists but holds no number parses to NaN; treat it as one page.
  const totalPages = Number.isNaN(parsedPageCount) ? 1 : parsedPageCount;
  console.log(`共发现 ${totalPages} 页。`);

  // Snapshot the first page's form fields so every POST re-sends them.
  const formValues = {};
  $('input[name^="txt"], select').each((idx, el) => {
    const name = $(el).attr('name');
    if (name) {
      formValues[name] = $(el).val() || '';
    }
  });

  for (let i = 2; i <= totalPages; i++) {
    console.log(`正在抓取第 ${i} 页...`);

    // These hidden fields are re-read from the latest response on every pass.
    const viewState = $('#__VIEWSTATE').val();
    const viewStateGenerator = $('#__VIEWSTATEGENERATOR').val();
    const eventValidation = $('#__EVENTVALIDATION').val();

    if (!viewState) {
      console.log('无法找到 __VIEWSTATE,终止抓取。');
      break;
    }

    const postData = new URLSearchParams();
    postData.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext');
    postData.append('__EVENTARGUMENT', '');
    postData.append('__VIEWSTATE', viewState);
    // URLSearchParams stringifies its values, so appending a missing field
    // would send the literal text "undefined"; optional fields are therefore
    // only added when the page actually provides them.
    if (viewStateGenerator) {
      postData.append('__VIEWSTATEGENERATOR', viewStateGenerator);
    }
    if (eventValidation) {
      postData.append('__EVENTVALIDATION', eventValidation);
    }
    for (const [name, value] of Object.entries(formValues)) {
      postData.append(name, value);
    }
    // The value sent is i - 1, i.e. the number of the page loaded just before.
    postData.append('PageNavigator1$txtNewPageIndex', (i - 1).toString());

    response = await axios.post(START_URL, postData, {
      headers: {
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': USER_AGENT,
        'Referer': START_URL,
      },
    });

    $ = cheerio.load(response.data);
    const nextPageData = extractDataFromHtml($);
    console.log(`第 ${i} 页抓取完成,获得 ${nextPageData.length} 条数据。`);

    if (nextPageData.length === 0) {
      console.log(`第 ${i} 页没有数据,抓取结束。`);
      break;
    }
    allData.push(...nextPageData);
  }

  console.log(`\n抓取全站数据完毕!共 ${allData.length} 条原始记录。`);
  return allData;
}

/**
 * Performs post-processing on the scraped data:
 * 1. Filters out records where "许可证号" is "空".
 * 2. Re-indexes "序号" sequentially (as strings, starting at "1").
 * 3. Converts "总套数" and "可售套数" fields to numbers (0 when not parsable).
 *
 * The input array and its records are left untouched; the result is made of
 * new objects.
 *
 * @param {Array<Object>} allData - The raw scraped data.
 * @returns {Promise<Array<Object>>} The processed data.
 */
async function processScrapedData(allData) {
  // 1. Filter out records where "许可证号" is "空"
  const keptRecords = allData.filter((record) => record['许可证号'] !== '空');
  console.log(`删除 "许可证号" 为 "空" 的记录后,剩余 ${keptRecords.length} 条记录。`);

  // 2 + 3. Build copies so the caller's records are not modified. Spreading
  // first keeps the original key order, which later fixes the column order
  // of the generated CSV and XLSX files.
  const processedData = keptRecords.map((record, index) => ({
    ...record,
    '序号': (index + 1).toString(),
    '总套数': parseInt(record['总套数'], 10) || 0,
    '可售套数': parseInt(record['可售套数'], 10) || 0,
  }));
  console.log('序号字段已重新编号。');
  console.log('"总套数" 和 "可售套数" 字段已转换为数字。');

  return processedData;
}

/**
 * Generates a CSV file from the given data.
 *
 * The column order is taken from the keys of the first record. Every field,
 * headers included, is wrapped in double quotes with embedded quotes doubled,
 * and the file starts with a UTF-8 BOM. Nothing is written when `data` is
 * empty.
 *
 * @param {Array<Object>} data - The data to convert to CSV.
 * @param {string} filePath - The path to save the CSV file.
 * @returns {Promise<void>}
 */
async function generateCsv(data, filePath) {
  if (data.length === 0) {
    console.log(`没有数据可生成 CSV 文件 (${filePath})。`);
    return;
  }

  // Missing values become an empty cell instead of the text "undefined"/"null".
  const quoteField = (value) => {
    const text = value === null || value === undefined ? '' : String(value);
    return `"${text.replace(/"/g, '""')}"`;
  };

  const headers = Object.keys(data[0]);
  const csvRows = [
    headers.map(quoteField).join(','),
    ...data.map((record) => headers.map((header) => quoteField(record[header])).join(',')),
  ];

  const BOM = '\uFEFF'; // UTF-8 BOM
  await fs.writeFile(filePath, BOM + csvRows.join('\n'), 'utf-8');
  console.log(`已生成 ${filePath} 文件。`);
}

/**
 * Generates an XLSX file from the given data.
 *
 * Writes a single worksheet named 'Data' whose columns follow the keys of the
 * first record, with a bold, grey-filled, centred header row and an
 * auto-filter over that row. Nothing is written when `data` is empty.
 *
 * @param {Array<Object>} data - The data to convert to XLSX.
 * @param {string} filePath - The path to save the XLSX file.
 * @returns {Promise<void>}
 */
async function generateXlsx(data, filePath) {
  if (data.length === 0) {
    console.log(`没有数据可生成 XLSX 文件 (${filePath})。`);
    return;
  }

  const workbook = new ExcelJS.Workbook();
  const worksheet = workbook.addWorksheet('Data');

  const headers = Object.keys(data[0]);
  worksheet.columns = headers.map((key) => ({
    header: key,
    key,
    // Address and link columns get extra width.
    width: key.includes('地址') || key.includes('链接') ? 40 : 20,
  }));

  worksheet.addRows(data);

  worksheet.getRow(1).eachCell((cell) => {
    cell.font = { bold: true };
    cell.fill = { type: 'pattern', pattern: 'solid', fgColor: { argb: 'FFDDDDDD' } };
    cell.alignment = { vertical: 'middle', horizontal: 'center' };
  });

  worksheet.autoFilter = {
    from: 'A1',
    to: { row: 1, column: headers.length },
  };

  await workbook.xlsx.writeFile(filePath);
  console.log(`已生成 ${filePath} 文件。`);
}

/**
 * Entry point: scrapes the site, post-processes the records and writes
 * data.json, ./data.csv and ./data.xlsx.
 *
 * Errors are logged rather than rethrown; the process exit code is set to 1
 * so that a failed run can be detected by whatever launched the script.
 *
 * @returns {Promise<void>}
 */
async function main() {
  try {
    const allData = await scrapeWebsite();
    const processedData = await processScrapedData(allData);

    await fs.writeFile('data.json', JSON.stringify(processedData, null, 4), 'utf-8');
    console.log(`更新后的数据已保存至 data.json 文件。`);

    await generateCsv(processedData, './data.csv');
    await generateXlsx(processedData, './data.xlsx');

    console.log('\n所有数据处理和文件生成任务已完成。');
  } catch (error) {
    console.error('抓取或处理过程中发生错误:', error.message);
    if (error.response) {
      console.error('Status:', error.response.status);
    }
    // Without this the script would still exit with status 0 after a failure.
    process.exitCode = 1;
  }
}

main();