182 lines
6.7 KiB
JavaScript
182 lines
6.7 KiB
JavaScript
import axios from './axios.js';
|
||
import * as cheerio from 'cheerio';
|
||
import fs from 'fs/promises';
|
||
import path from 'path';
|
||
import { fileURLToPath } from 'url'; // Add fileURLToPath import
|
||
import { getEncryptedUrl } from './getEncryptedUrl.js'; // Import shared encryption function
|
||
|
||
// ES modules do not provide __filename/__dirname, so derive them from this module's URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Base address of the site and the company-list page that scraping starts from.
const BASE_URL = 'http://120.236.48.169:89/HEMS/';
const START_URL = `${BASE_URL}CompanyList.aspx`;
|
||
|
||
/**
 * Extracts company records from a parsed company-list page.
 *
 * Every `.resultlist table` row that contains `<td>` cells and has at least five
 * of them becomes one record; rows with fewer cells are dropped. When the
 * company-name cell holds a link whose `onclick` is an `open_click('…')` call,
 * the quoted argument is handed to `getEncryptedUrl` and the result is stored
 * under `企业链接`; otherwise that field stays an empty string.
 *
 * @param {object} $ - Cheerio object for the page.
 * @param {string} origin - Origin URL, forwarded unchanged to `getEncryptedUrl`.
 * @returns {Promise<Array<Object>>} - Company records, in table-row order.
 */
async function extractDataFromHtml($, origin) {
  const rows = $('.resultlist table tr:has(td)').get();

  // Rows are processed concurrently; Promise.all keeps the results in row order.
  const records = await Promise.all(rows.map(async (row) => {
    const columns = $(row).find('td');
    // Expected columns: 序号, 营业执照注册号, 企业名称, 法人代表, 地址.
    if (columns.length < 5) {
      return null;
    }

    const cellText = (index) => $(columns[index]).text().trim();

    // 企业名称 is the 3rd column; its text may be wrapped in a link.
    const companyNameLink = $(columns[2]).find('a');
    const hasLink = companyNameLink.length > 0;
    const onclickAttr = hasLink ? companyNameLink.attr('onclick') : undefined;
    const match = onclickAttr
      ? onclickAttr.match(/open_click\(['"]([^'"]+)['"]\)/)
      : null;

    return {
      '序号': cellText(0),
      '营业执照注册号': cellText(1),
      '企业名称': hasLink ? companyNameLink.text().trim() : cellText(2),
      '法人代表': cellText(3),
      '地址': cellText(4),
      '企业链接': match && match[1] ? await getEncryptedUrl(match[1], origin) : '',
    };
  }));

  return records.filter((record) => record !== null);
}
|
||
|
||
/**
 * Scrapes all company data from the target website, handling pagination.
 *
 * The first page is fetched with GET. Each later page is requested by POSTing
 * the hidden WebForms state fields read from the most recently loaded page,
 * with `PageNavigator1$LnkBtnNext` as the event target. Paging stops early when
 * a page has no `__VIEWSTATE` or yields no records.
 *
 * @returns {Promise<Array<Object>>} - An array of all scraped data records.
 */
async function scrapeWebsite() {
  const allData = [];
  const origin = new URL(START_URL).origin;

  console.log('开始抓取公司列表第一页数据...');
  let response = await axios.get(START_URL);
  let $ = cheerio.load(response.data);

  const firstPageData = await extractDataFromHtml($, origin);
  allData.push(...firstPageData);
  console.log(`第一页抓取完成,获得 ${firstPageData.length} 条数据。`);

  // Read a pager counter from the first page; fall back when the element is
  // missing or its text is not a number (parseInt would otherwise yield NaN).
  const firstPage$ = $;
  const readCount = (selector, fallback) => {
    const span = firstPage$(selector);
    if (!span.length) {
      return fallback;
    }
    const parsed = Number.parseInt(span.text(), 10);
    return Number.isNaN(parsed) ? fallback : parsed;
  };
  const totalRecords = readCount('#PageNavigator1_LblRecordCount', 0);
  const totalPages = readCount('#PageNavigator1_LblPageCount', 1);
  console.log(`共发现 ${totalRecords} 条记录,分为 ${totalPages} 页。`);

  // Collect the first page's filter fields so every POST repeats them.
  const formValues = {};
  $('input[name^="txt"], select').each((_index, el) => {
    const name = $(el).attr('name');
    if (name) {
      formValues[name] = $(el).val() || '';
    }
  });

  // Scrape remaining pages
  for (let page = 2; page <= totalPages; page++) {
    console.log(`正在抓取第 ${page} 页...`);

    const viewState = $('#__VIEWSTATE').val();
    if (!viewState) {
      console.log('无法找到 __VIEWSTATE,终止抓取。');
      break;
    }

    const postData = new URLSearchParams();
    postData.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext');
    postData.append('__EVENTARGUMENT', '');
    postData.append('__VIEWSTATE', viewState);
    // Only send the optional hidden fields when the page has them:
    // URLSearchParams would serialise a missing value as the text "undefined".
    for (const field of ['__VIEWSTATEGENERATOR', '__EVENTVALIDATION']) {
      const value = $(`#${field}`).val();
      if (value) {
        postData.append(field, value);
      }
    }

    for (const [name, value] of Object.entries(formValues)) {
      postData.append(name, value);
    }
    // The pager box carries the page being left (page - 1); `set` guarantees a single value.
    postData.set('PageNavigator1$txtNewPageIndex', (page - 1).toString());

    response = await axios.post(START_URL, postData, {
      headers: {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': START_URL,
      },
    });

    $ = cheerio.load(response.data);
    const nextPageData = await extractDataFromHtml($, origin);
    console.log(`第 ${page} 页抓取完成,获得 ${nextPageData.length} 条数据。`);

    if (nextPageData.length === 0) {
      console.log(`第 ${page} 页没有数据,抓取结束。`);
      break;
    }
    allData.push(...nextPageData);
  }

  console.log(`\n抓取全站公司数据完毕!共 ${allData.length} 条原始记录。`);
  return allData;
}
|
||
|
||
/**
 * Performs post-processing on the scraped data:
 * 1. Re-indexes "序号" sequentially, as the strings '1', '2', … in array order.
 *
 * The records are updated in place and the same array instance is returned.
 *
 * @param {Array<Object>} allData - The raw scraped data.
 * @returns {Array<Object>} - The processed data (the `allData` array itself).
 */
function processScrapedData(allData) {
  allData.forEach((record, index) => {
    record['序号'] = String(index + 1);
  });
  console.log('序号字段已重新编号。');

  return allData;
}
|
||
|
||
/**
 * Orchestrates the whole run: scrapes the company list, re-indexes the records,
 * and writes them as JSON to `../data/companies.json` relative to this module.
 *
 * Any error is logged (with HTTP status and body when the error carries a
 * `response`) and the process is then terminated with exit code 1.
 *
 * @returns {Promise<void>}
 */
export async function main() {
  try {
    console.log('🚀 开始抓取普宁房地产开发企业列表...');
    const allData = await scrapeWebsite();
    const processedData = processScrapedData(allData);

    const dataPath = path.join(__dirname, '..', 'data', 'companies.json');
    // Ensure the data directory exists before writing into it.
    await fs.mkdir(path.dirname(dataPath), { recursive: true });
    await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8');
    console.log(`✅ 公司列表数据已保存至 data/companies.json 文件。`);

    console.log('\n所有公司数据抓取和处理任务已完成。');
  } catch (error) {
    console.error('❌ 抓取或处理公司列表过程中发生错误:', error.message);
    if (error.response) {
      console.error('Status:', error.response.status);
      console.error('Data:', error.response.data);
    }
    process.exit(1); // Exit with error code
  }
}
|
||
|
||
// Self-execution logic: run main() only when this module is the script Node was
// started with, not when another module imports it. process.argv[1] can be
// absent (e.g. when no script file is given), so it is checked before use.
if (process.argv[1] && path.resolve(process.argv[1]) === __filename) {
  // main() logs its own failures and exits non-zero, so the promise is
  // intentionally not awaited here.
  void main();
}
|