import axios from './axios.js';
import * as cheerio from 'cheerio';
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';
import { getEncryptedUrl } from './getEncryptedUrl.js';
import { scrapePaginatedData, reIndex } from './scraperUtils.js';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

const BASE_URL = 'http://120.236.48.169:89/HEMS/';
const START_URL = `${BASE_URL}CompanyList.aspx`;

// Captures the single quoted argument of an inline `open_click('...')` handler.
// No `g` flag, so the shared literal carries no `lastIndex` state between rows.
const OPEN_CLICK_PATTERN = /open_click\(['"]([^'"]+)['"]\)/;

/**
 * Extracts company records from the result table of a parsed listing page.
 *
 * Every `tr` under `.resultlist table` that contains at least one `td` is
 * inspected; rows with fewer than five cells are dropped. When the company
 * name cell holds a link whose `onclick` matches `open_click('<arg>')`, the
 * captured argument is passed to `getEncryptedUrl` and the result is stored
 * under the '企业链接' key; otherwise that key stays an empty string.
 *
 * All rows are processed concurrently through `Promise.all`, so a single
 * rejected `getEncryptedUrl` call rejects the whole extraction.
 *
 * @param {object} $ - Cheerio object for the page.
 * @param {string} origin - Forwarded unchanged to `getEncryptedUrl`
 *   (presumably the origin used to resolve the link — confirm against that helper).
 * @returns {Promise<Array<Object>>} The company records, in table order.
 */
async function extractDataFromHtml($, origin) {
    const rows = $('.resultlist table tr:has(td)').get();

    const pendingRecords = rows.map(async (row) => {
        const cells = $(row).find('td');

        // Expected columns: serial number, business licence registration number,
        // company name, legal representative, address.
        if (cells.length < 5) {
            return null;
        }

        const cellText = (index) => $(cells[index]).text().trim();

        const nameCell = $(cells[2]); // Company name is the 3rd column.
        const nameLink = nameCell.find('a');
        const hasLink = nameLink.length > 0;

        const record = {
            '序号': cellText(0),
            '营业执照注册号': cellText(1),
            '企业名称': hasLink ? nameLink.text().trim() : nameCell.text().trim(),
            '法人代表': cellText(3),
            '地址': cellText(4),
            '企业链接': '', // Filled in below when the row exposes an open_click handler.
        };

        if (hasLink) {
            const onclickAttr = nameLink.attr('onclick');
            const linkArgument = onclickAttr ? onclickAttr.match(OPEN_CLICK_PATTERN)?.[1] : undefined;
            if (linkArgument) {
                record['企业链接'] = await getEncryptedUrl(linkArgument, origin);
            }
        }

        return record;
    });

    const records = await Promise.all(pendingRecords);
    return records.filter((record) => record !== null);
}

/**
 * Orchestrates the scrape: collects every page starting at START_URL,
 * re-indexes the combined records and writes them as pretty-printed JSON to
 * `../data/companies.json` relative to this module (the directory is created
 * if it does not exist).
 *
 * On any failure the error is logged (plus the HTTP status and body when the
 * error carries a `response` property) and the process exits with code 1.
 *
 * @returns {Promise<void>}
 */
export async function main() {
    try {
        console.log('🚀 开始抓取普宁房地产开发企业列表...');

        // Pagination and re-indexing are delegated to the shared scraper utilities.
        const allData = await scrapePaginatedData(START_URL, extractDataFromHtml);
        const processedData = reIndex(allData);

        const dataPath = path.join(__dirname, '..', 'data', 'companies.json');
        await fs.mkdir(path.dirname(dataPath), { recursive: true }); // Ensure the data directory exists.
        await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8');

        console.log(`✅ 公司列表数据已保存至 data/companies.json 文件。`);
        console.log('\n所有公司数据抓取和处理任务已完成。');
    } catch (error) {
        // `error` may be a non-Error value (or even null/undefined); guard the
        // property reads so the handler itself cannot throw before exiting.
        console.error('❌ 抓取或处理公司列表过程中发生错误:', error?.message ?? error);
        if (error?.response) {
            console.error('Status:', error.response.status);
            console.error('Data:', error.response.data);
        }
        process.exit(1); // Exit with error code.
    }
}

// Run the scrape only when this file is the script Node was started with,
// not when it is imported by another module.
if (__filename === process.argv[1]) {
    main();
}