// getPreSaleLicense.js
// Scrapes the paginated pre-sale license list, cleans the records,
// and saves them to data/preSaleLicense.json.
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';

import * as cheerio from 'cheerio';

import axios from './axios.js';
import { getEncryptedUrl } from './getEncryptedUrl.js';
import { scrapePaginatedData, reIndex } from './scraperUtils.js'; // Import new utils
|
|
|
|
// Root URL of the HPMS site that hosts the pre-sale certificate list.
const BASE_URL = 'http://120.236.48.169:89/HPMS/';
// Entry page handed to the paginated scraper as its starting point.
const START_URL = `${BASE_URL}presellCertList.aspx`;
|
|
|
|
// Captures the single quoted argument of an inline handler such as
// open_click('...') or open_click("...").
const OPEN_CLICK_ARG_PATTERN = /open_click\(['"]([^'"]+)['"]\)/;

// A data row must expose at least this many <td> cells; shorter rows are skipped.
const MIN_COLUMN_COUNT = 9;

/**
 * Extracts the license table rows from one parsed result page.
 *
 * Every `tr` under `.resultlist table` that contains `td` cells is inspected.
 * Rows with fewer than nine cells are dropped; each remaining row becomes one
 * record keyed by the Chinese column headers. The license number is read from
 * the `<a>` inside the second cell, and when that anchor's `onclick` matches
 * `open_click('<arg>')` the captured argument is resolved through
 * `getEncryptedUrl(arg, origin)` and stored under "许可证链接" (otherwise that
 * field stays an empty string).
 *
 * All rows are processed concurrently via `Promise.all`, so a rejection from
 * `getEncryptedUrl` for any row rejects the whole call.
 *
 * @param {Function} $ - Selector function for the parsed page (presumably a
 *   loaded cheerio instance — confirm against the caller).
 * @param {*} origin - Passed through untouched to `getEncryptedUrl`.
 * @returns {Promise<Array<Object>>} One record per accepted row, in page order.
 */
async function extractDataFromHtml($, origin) {
  const rows = $('.resultlist table tr:has(td)').get();

  const records = await Promise.all(
    rows.map(async (row) => {
      const columns = $(row).find('td');
      if (columns.length < MIN_COLUMN_COUNT) {
        return null;
      }

      const cellText = (index) => $(columns[index]).text().trim();
      // 许可证号 lives in the 2nd column, wrapped in an anchor.
      const licenseLink = $(columns[1]).find('a');

      const rowData = {
        '序号': cellText(0),
        '许可证号': licenseLink.text().trim(),
        '开发企业': cellText(2),
        '项目名称': cellText(3),
        '项目地址': cellText(4),
        '批准时间': cellText(5),
        '所在区域': cellText(6),
        '总套数': cellText(7),
        '可售套数': cellText(8),
        '许可证链接': '',
      };

      // The target is carried in an onclick attribute rather than a standard href.
      const clickArg = licenseLink.attr('onclick')?.match(OPEN_CLICK_ARG_PATTERN)?.[1];
      if (clickArg) {
        rowData['许可证链接'] = await getEncryptedUrl(clickArg, origin);
      }

      return rowData;
    }),
  );

  return records.filter((record) => record !== null);
}
|
|
|
|
/**
 * Performs post-processing on the scraped data:
 * 1. Drops records whose "许可证号" is empty/missing or the placeholder "空".
 * 2. Re-indexes "序号" using the shared `reIndex` utility.
 * 3. Converts "总套数" and "可售套数" to integers (0 when the text has no
 *    leading number).
 *
 * The surviving records are shallow-copied first, so the objects inside
 * `allData` are never modified.
 *
 * @param {Array<Object>} allData - The raw scraped data.
 * @returns {Promise<Array<Object>>} The processed data.
 */
async function processScrapedData(allData) {
  // 1. Keep only records with a real license number, copying each one so the
  //    caller's objects stay untouched by the steps below.
  const processedData = allData
    .filter((record) => record['许可证号'] && record['许可证号'] !== '空')
    .map((record) => ({ ...record }));
  console.log(`删除无效许可证记录后,剩余 ${processedData.length} 条记录。`);

  // 2. Re-index "序号". The return value is ignored, so reIndex is presumably
  //    expected to renumber the records in place — verify against scraperUtils.
  reIndex(processedData);

  // 3. Convert the count fields to numbers. `|| 0` is deliberate: parseInt
  //    yields NaN for non-numeric text and NaN must fall back to 0.
  for (const record of processedData) {
    record['总套数'] = Number.parseInt(record['总套数'], 10) || 0;
    record['可售套数'] = Number.parseInt(record['可售套数'], 10) || 0;
  }
  console.log('"总套数" 和 "可售套数" 字段已转换为数字。');

  return processedData;
}
|
|
|
|
/**
 * Main entry point, exported so it can be invoked from the project root.
 *
 * Scrapes every page starting at START_URL, post-processes the records, and
 * writes them as 4-space-indented JSON to `../data/preSaleLicense.json`
 * relative to this module, creating the directory when needed.
 *
 * @returns {Promise<void>}
 * @throws Re-throws any scraping, processing, or file-system error after logging it.
 */
export async function main() {
  try {
    console.log('🚀 开始抓取预售许可证数据...');
    const allData = await scrapePaginatedData(START_URL, extractDataFromHtml);

    const processedData = await processScrapedData(allData);

    // fileURLToPath decodes percent-escapes (spaces, non-ASCII directory names)
    // and handles Windows drive letters; URL#pathname does neither.
    const moduleDir = path.dirname(fileURLToPath(import.meta.url));
    const dataPath = path.join(moduleDir, '..', 'data', 'preSaleLicense.json');
    await fs.mkdir(path.dirname(dataPath), { recursive: true });
    await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8');
    console.log(`✅ 更新后的数据已保存至 data/preSaleLicense.json 文件。`);

    console.log('\n所有数据处理和文件生成任务已完成。');
  } catch (error) {
    // Optional chaining keeps the logging itself from throwing if something
    // other than an Error object was thrown.
    console.error('在 getPreSaleLicense.js 抓取或处理过程中发生错误:', error?.message ?? error);
    if (error?.response) {
      console.error('Status:', error.response.status);
    }
    throw error; // Re-throw so the caller can handle the failure.
  }
}