// Scraper for the Puning (普宁) real-estate developer company list (HEMS portal).
import axios from './axios.js';
import * as cheerio from 'cheerio';
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';
import { getEncryptedUrl } from './getEncryptedUrl.js';
import { scrapePaginatedData, reIndex } from './scraperUtils.js'; // Import new utils
const __filename = fileURLToPath(import.meta.url);
|
|
const __dirname = path.dirname(__filename);
|
|
|
|
const BASE_URL = 'http://120.236.48.169:89/HEMS/';
|
|
const START_URL = BASE_URL + 'CompanyList.aspx';
|
|
|
|
/**
|
|
* Extracts company data from HTML content.
|
|
* @param {object} $ - Cheerio object.
|
|
* @param {string} origin - The origin URL for resolving relative links.
|
|
* @returns {Promise<Array<Object>>} - An array of company data records.
|
|
*/
|
|
async function extractDataFromHtml($, origin) {
|
|
const dataPromises = $('.resultlist table tr:has(td)').get().map(async (row) => {
|
|
const columns = $(row).find('td');
|
|
if (columns.length < 5) { // Expecting 5 columns: 序号, 营业执照注册号, 企业名称, 法人代表, 地址
|
|
return null;
|
|
}
|
|
|
|
const companyNameCell = $(columns[2]); // 企业名称 is the 3rd column
|
|
const companyNameLink = companyNameCell.find('a');
|
|
|
|
const rowData = {
|
|
'序号': $(columns[0]).text().trim(),
|
|
'营业执照注册号': $(columns[1]).text().trim(),
|
|
'企业名称': companyNameLink.length ? companyNameLink.text().trim() : companyNameCell.text().trim(),
|
|
'法人代表': $(columns[3]).text().trim(),
|
|
'地址': $(columns[4]).text().trim(),
|
|
'企业链接': '', // Initialize
|
|
};
|
|
|
|
if (companyNameLink.length) {
|
|
const onclickAttr = companyNameLink.attr('onclick');
|
|
if (onclickAttr) {
|
|
const match = onclickAttr.match(/open_click\(['"]([^'"]+)['"]\)/);
|
|
if (match && match[1]) {
|
|
rowData['企业链接'] = await getEncryptedUrl(match[1], origin);
|
|
}
|
|
}
|
|
}
|
|
return rowData;
|
|
});
|
|
|
|
const data = await Promise.all(dataPromises);
|
|
return data.filter(item => item !== null);
|
|
}
|
|
|
|
|
|
// Main function to orchestrate the scraping and saving process
|
|
export async function main() {
|
|
try {
|
|
console.log('🚀 开始抓取普宁房地产开发企业列表...');
|
|
const allData = await scrapePaginatedData(START_URL, extractDataFromHtml); // Use generic pagination
|
|
const processedData = reIndex(allData); // Use generic re-indexing
|
|
|
|
const dataPath = path.join(__dirname, '..', 'data', 'companies.json');
|
|
await fs.mkdir(path.dirname(dataPath), { recursive: true }); // Ensure data directory exists
|
|
await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8');
|
|
console.log(`✅ 公司列表数据已保存至 data/companies.json 文件。`);
|
|
|
|
console.log('\n所有公司数据抓取和处理任务已完成。');
|
|
|
|
} catch (error) {
|
|
console.error('❌ 抓取或处理公司列表过程中发生错误:', error.message);
|
|
if (error.response) {
|
|
console.error('Status:', error.response.status);
|
|
console.error('Data:', error.response.data);
|
|
}
|
|
process.exit(1); // Exit with error code
|
|
}
|
|
}
|
|
|
|
// Self-execution logic
|
|
if (__filename === process.argv[1]) {
|
|
main();
|
|
}
|