import axios from './axios.js';
import * as cheerio from 'cheerio';
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';
import { getEncryptedUrl } from './getEncryptedUrl.js';
import { scrapePaginatedData, reIndex } from './scraperUtils.js'; // Shared pagination / re-index utilities

const BASE_URL = 'http://120.236.48.169:89/HPMS/';
const START_URL = `${BASE_URL}presellCertList.aspx`;

// Rows with fewer cells than this are skipped (the original author notes the
// layout was derived from debug.html, which has 9 columns).
const EXPECTED_COLUMN_COUNT = 9;

// The licence link lives in an onclick handler such as open_click('...'),
// not in a standard href. Either quote style is accepted.
const OPEN_CLICK_PATTERN = /open_click\(['"]([^'"]+)['"]\)/;

/**
 * Extracts the licence table rows from one page of results.
 *
 * @param {Function} $ - Selector function for the page (used as `$(selector)`;
 *   presumably a cheerio root - confirm against scrapePaginatedData).
 * @param {*} origin - Passed through unchanged to `getEncryptedUrl`.
 * @returns {Promise<Array<Object>>} One record per table row that has at
 *   least 9 cells; rows with fewer cells are dropped.
 */
async function extractDataFromHtml($, origin) {
    const rows = $('.resultlist table tr:has(td)').get();

    const pendingRecords = rows.map(async (row) => {
        const columns = $(row).find('td');
        if (columns.length < EXPECTED_COLUMN_COUNT) {
            return null;
        }

        const cellText = (index) => $(columns[index]).text().trim();

        // The licence number (2nd column) is read from the anchor inside the cell.
        const licenseLinkTag = $(columns[1]).find('a');

        const rowData = {
            '序号': cellText(0),
            '许可证号': licenseLinkTag.text().trim(),
            '开发企业': cellText(2),
            '项目名称': cellText(3),
            '项目地址': cellText(4),
            '批准时间': cellText(5),
            '所在区域': cellText(6),
            '总套数': cellText(7),
            '可售套数': cellText(8),
            '许可证链接': '', // Filled in below when an onclick target is found
        };

        const onclickAttr = licenseLinkTag.attr('onclick');
        const match = onclickAttr ? onclickAttr.match(OPEN_CLICK_PATTERN) : null;
        if (match && match[1]) {
            rowData['许可证链接'] = await getEncryptedUrl(match[1], origin);
        }

        return rowData;
    });

    const records = await Promise.all(pendingRecords);
    return records.filter((record) => record !== null);
}

/**
 * Performs post-processing on the scraped data:
 * 1. Filters out records whose "许可证号" is "空" or empty/missing.
 * 2. Re-indexes "序号" using the shared `reIndex` utility.
 * 3. Converts "总套数" and "可售套数" to integers (unparseable values become 0).
 *
 * Note: the surviving record objects are modified in place.
 *
 * @param {Array<Object>} allData - The raw scraped data.
 * @returns {Promise<Array<Object>>} The processed data.
 */
async function processScrapedData(allData) {
    // 1. Drop records without a usable licence number.
    const processedData = allData.filter((record) => {
        const licenseNumber = record['许可证号'];
        return licenseNumber !== '空' && licenseNumber;
    });
    console.log(`删除无效许可证记录后,剩余 ${processedData.length} 条记录。`);

    // 2. Re-index "序号" via the shared utility.
    reIndex(processedData);

    // 3. Convert the unit counts to numbers; NaN (and 0) collapse to 0.
    for (const record of processedData) {
        record['总套数'] = Number.parseInt(record['总套数'], 10) || 0;
        record['可售套数'] = Number.parseInt(record['可售套数'], 10) || 0;
    }
    console.log('"总套数" 和 "可售套数" 字段已转换为数字。');

    return processedData;
}

/**
 * Main entry point - exported so it can be invoked from the project root.
 *
 * Scrapes every page starting at START_URL, post-processes the records and
 * writes them as JSON to `../data/preSaleLicense.json` relative to this
 * module's directory (creating the directory if needed).
 *
 * @returns {Promise<void>}
 * @throws Re-throws any error raised while scraping, processing or writing,
 *   after logging it, so the caller can handle it.
 */
export async function main() {
    try {
        console.log('🚀 开始抓取预售许可证数据...');
        const allData = await scrapePaginatedData(START_URL, extractDataFromHtml);
        const processedData = await processScrapedData(allData);

        // fileURLToPath decodes percent-escapes and handles Windows drive
        // letters; URL#pathname does neither and yields a broken path there.
        const moduleDir = path.dirname(fileURLToPath(import.meta.url));
        const dataPath = path.join(moduleDir, '..', 'data', 'preSaleLicense.json');

        await fs.mkdir(path.dirname(dataPath), { recursive: true });
        await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8');

        console.log(`✅ 更新后的数据已保存至 data/preSaleLicense.json 文件。`);
        console.log('\n所有数据处理和文件生成任务已完成。');
    } catch (error) {
        console.error('在 getPreSaleLicense.js 抓取或处理过程中发生错误:', error.message);
        if (error.response) {
            console.error('Status:', error.response.status);
        }
        throw error; // Re-throw so the caller can handle the failure
    }
}