puning-real-estate/scripts/getCompanies.js
2026-01-22 16:04:40 +08:00

85 lines
3.3 KiB
JavaScript

import axios from './axios.js';
import * as cheerio from 'cheerio';
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';
import { getEncryptedUrl } from './getEncryptedUrl.js';
import { scrapePaginatedData, reIndex } from './scraperUtils.js'; // Import new utils
// ES modules do not define __filename/__dirname, so derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Base address of the HEMS site and the company list page used as the scrape entry URL.
const BASE_URL = 'http://120.236.48.169:89/HEMS/';
const START_URL = `${BASE_URL}CompanyList.aspx`;
/**
 * Turns the rows of the company result table into plain records.
 *
 * Every `<tr>` containing `<td>` cells under `.resultlist table` is inspected.
 * Rows with fewer than five cells are dropped. When the company-name cell holds
 * a link whose `onclick` is an `open_click('<id>')` call, the captured id is
 * handed to `getEncryptedUrl` together with `origin` and the result is stored
 * under `企业链接`; otherwise that field stays an empty string.
 *
 * @param {object} $ - Cheerio object for the loaded page.
 * @param {string} origin - Origin URL, passed through to `getEncryptedUrl`.
 * @returns {Promise<Array<Object>>} Records for all rows that had at least five cells.
 */
async function extractDataFromHtml($, origin) {
    // Captures the single quoted argument of an `open_click(...)` handler.
    const openClickPattern = /open_click\(['"]([^'"]+)['"]\)/;

    // Converts one table row into a record, or null when the row is too short.
    const toRecord = async (row) => {
        const cells = $(row).find('td');
        // Column layout: 序号, 营业执照注册号, 企业名称, 法人代表, 地址
        if (cells.length < 5) {
            return null;
        }

        const textOf = (index) => $(cells[index]).text().trim();
        const nameCell = $(cells[2]);
        const nameLink = nameCell.find('a');
        const hasLink = Boolean(nameLink.length);

        const record = {
            '序号': textOf(0),
            '营业执照注册号': textOf(1),
            // Prefer the link text; fall back to the whole cell when there is no link.
            '企业名称': hasLink ? nameLink.text().trim() : nameCell.text().trim(),
            '法人代表': textOf(3),
            '地址': textOf(4),
            '企业链接': '',
        };

        const onclick = hasLink ? nameLink.attr('onclick') : undefined;
        const captured = onclick ? onclick.match(openClickPattern) : null;
        if (captured && captured[1]) {
            record['企业链接'] = await getEncryptedUrl(captured[1], origin);
        }
        return record;
    };

    // All rows are converted concurrently; short rows resolve to null and are removed.
    const rows = $('.resultlist table tr:has(td)').get();
    const records = await Promise.all(rows.map((row) => toRecord(row)));
    return records.filter((record) => record !== null);
}
/**
 * Scrapes every page of the company list, re-indexes the combined records and
 * writes them as JSON to `../data/companies.json` relative to this script.
 *
 * Any failure is logged (including the HTTP status and body when the error
 * carries a `response`) and the process exits with code 1.
 */
export async function main() {
    try {
        console.log('🚀 开始抓取普宁房地产开发企业列表...');

        const scrapedRecords = await scrapePaginatedData(START_URL, extractDataFromHtml);
        const companies = reIndex(scrapedRecords);

        // Create the data directory first so the write cannot fail on a missing folder.
        const outputDir = path.join(__dirname, '..', 'data');
        const outputFile = path.join(outputDir, 'companies.json');
        await fs.mkdir(outputDir, { recursive: true });

        const serialized = JSON.stringify(companies, null, 4);
        await fs.writeFile(outputFile, serialized, 'utf-8');

        console.log('✅ 公司列表数据已保存至 data/companies.json 文件。');
        console.log('\n所有公司数据抓取和处理任务已完成。');
    } catch (error) {
        console.error('❌ 抓取或处理公司列表过程中发生错误:', error.message);

        // Errors that carry a response also get their status and payload logged.
        const { response } = error;
        if (response) {
            console.error('Status:', response.status);
            console.error('Data:', response.data);
        }

        // Signal failure to the calling shell.
        process.exit(1);
    }
}
// Run main() only when this file is the script passed to node, not when it is imported.
const isInvokedDirectly = process.argv[1] === __filename;
if (isInvokedDirectly) {
    main();
}