puning-real-estate/scripts/getCompanies.js

182 lines
6.7 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import axios from './axios.js';
import * as cheerio from 'cheerio';
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url'; // Add fileURLToPath import
import { getEncryptedUrl } from './getEncryptedUrl.js'; // Import shared encryption function
// ES modules do not provide __filename/__dirname, so derive both from this module's URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Site root, and the company list page that the scrape starts from.
const BASE_URL = 'http://120.236.48.169:89/HEMS/';
const START_URL = `${BASE_URL}CompanyList.aspx`;
/**
 * Turns the rows of the company result table into plain records.
 *
 * Every `tr` containing `td` cells under `.resultlist table` is inspected;
 * rows with fewer than five cells are dropped. When the company-name cell
 * holds a link whose `onclick` is an `open_click('...')` call, the quoted
 * argument is passed to `getEncryptedUrl` and the result stored as '企业链接'.
 * All rows are processed concurrently.
 *
 * @param {object} $ - Cheerio object loaded with a company list page.
 * @param {string} origin - Origin forwarded to `getEncryptedUrl`.
 * @returns {Promise<Array<Object>>} One record per accepted row, in table order.
 */
async function extractDataFromHtml($, origin) {
    const openClickPattern = /open_click\(['"]([^'"]+)['"]\)/;

    // Builds the record for a single row, or null when the row is too short.
    const toRecord = async (row) => {
        const cells = $(row).find('td');
        // Expected cells: 序号, 营业执照注册号, 企业名称, 法人代表, 地址.
        if (cells.length < 5) {
            return null;
        }

        const textOf = (index) => $(cells[index]).text().trim();
        const nameCell = $(cells[2]); // 企业名称 lives in the third cell
        const anchor = nameCell.find('a');
        const hasAnchor = Boolean(anchor.length);

        const record = {
            '序号': textOf(0),
            '营业执照注册号': textOf(1),
            // Prefer the link text; fall back to the whole cell when there is no link.
            '企业名称': (hasAnchor ? anchor : nameCell).text().trim(),
            '法人代表': textOf(3),
            '地址': textOf(4),
            '企业链接': '', // stays empty unless an open_click argument is found
        };

        const handler = hasAnchor ? anchor.attr('onclick') : undefined;
        const token = handler ? handler.match(openClickPattern)?.[1] : undefined;
        if (token) {
            record['企业链接'] = await getEncryptedUrl(token, origin);
        }
        return record;
    };

    const rows = $('.resultlist table tr:has(td)').get();
    const records = await Promise.all(rows.map((row) => toRecord(row)));
    return records.filter((record) => record !== null);
}
/**
 * Reads the integer shown in a pager label.
 * @param {object} $label - Cheerio selection of the label element.
 * @param {number} fallback - Value returned when the label is missing or its text is not a number.
 * @returns {number} The parsed count, or `fallback`.
 */
function readPagerCount($label, fallback) {
    if (!$label.length) {
        return fallback;
    }
    const parsed = Number.parseInt($label.text(), 10);
    return Number.isNaN(parsed) ? fallback : parsed;
}

/**
 * Builds the form body that posts back the pager's "next" link for the page loaded in `$`.
 * @param {object} $ - Cheerio object of the page currently displayed.
 * @param {Object<string, string>} formValues - Search-form fields to replay with the request.
 * @param {number} currentPage - Number of the page loaded in `$`.
 * @returns {URLSearchParams|null} The request body, or null when the page has no `__VIEWSTATE`.
 */
function buildNextPagePostData($, formValues, currentPage) {
    const viewState = $('#__VIEWSTATE').val();
    if (!viewState) {
        return null;
    }
    const viewStateGenerator = $('#__VIEWSTATEGENERATOR').val();
    const eventValidation = $('#__EVENTVALIDATION').val();

    const postData = new URLSearchParams();
    postData.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext');
    postData.append('__EVENTARGUMENT', '');
    postData.append('__VIEWSTATE', viewState);
    // URLSearchParams stringifies its values, so a missing hidden field must be
    // skipped; otherwise the literal text "undefined" would be posted.
    if (viewStateGenerator != null) {
        postData.append('__VIEWSTATEGENERATOR', viewStateGenerator);
    }
    if (eventValidation) {
        postData.append('__EVENTVALIDATION', eventValidation);
    }
    for (const [name, value] of Object.entries(formValues)) {
        postData.append(name, value);
    }
    // NOTE(review): the page-index box is sent with the page currently shown
    // (the one before "next" is applied) — confirm against the site's pager.
    postData.append('PageNavigator1$txtNewPageIndex', String(currentPage));
    return postData;
}

/**
 * Scrapes all company data from the target website, handling pagination.
 *
 * The first page is fetched with GET. Every following page is requested by
 * posting the "next" pager link together with the hidden state fields of the
 * page just received. Paging stops early when a page has no `__VIEWSTATE` or
 * yields no rows.
 *
 * @returns {Promise<Array<Object>>} All scraped records, in page order.
 */
async function scrapeWebsite() {
    const origin = new URL(START_URL).origin;
    const allData = [];

    console.log('开始抓取公司列表第一页数据...');
    let response = await axios.get(START_URL);
    let $ = cheerio.load(response.data);
    const firstPageData = await extractDataFromHtml($, origin);
    allData.push(...firstPageData);
    console.log(`第一页抓取完成,获得 ${firstPageData.length} 条数据。`);

    // Pager totals; unreadable labels fall back to "no records, single page".
    const totalRecords = readPagerCount($('#PageNavigator1_LblRecordCount'), 0);
    const totalPages = readPagerCount($('#PageNavigator1_LblPageCount'), 1);
    console.log(`共发现 ${totalRecords} 条记录,分为 ${totalPages} 页。`);

    // Search-form fields of the first page, replayed unchanged with every POST.
    const formValues = {};
    $('input[name^="txt"], select').each((idx, el) => {
        const name = $(el).attr('name');
        if (name) {
            formValues[name] = $(el).val() || '';
        }
    });

    for (let page = 2; page <= totalPages; page++) {
        console.log(`正在抓取第 ${page} 页...`);
        // The hidden state must come from the most recently received page.
        const postData = buildNextPagePostData($, formValues, page - 1);
        if (postData === null) {
            console.log('无法找到 __VIEWSTATE终止抓取。');
            break;
        }

        response = await axios.post(START_URL, postData, {
            headers: {
                'Content-Type': 'application/x-www-form-urlencoded',
                'Referer': START_URL,
            }
        });
        $ = cheerio.load(response.data);

        const nextPageData = await extractDataFromHtml($, origin);
        console.log(`${page} 页抓取完成,获得 ${nextPageData.length} 条数据。`);
        if (nextPageData.length === 0) {
            console.log(`${page} 页没有数据,抓取结束。`);
            break;
        }
        allData.push(...nextPageData);
    }

    console.log(`\n抓取全站公司数据完毕!共 ${allData.length} 条原始记录。`);
    return allData;
}
/**
 * Post-processes the scraped records by renumbering their "序号" field.
 *
 * Each record receives its 1-based position in the array as a string. The
 * records are modified in place and the same array is handed back.
 *
 * @param {Array<Object>} allData - The raw scraped data.
 * @returns {Array<Object>} The same array, with "序号" rewritten sequentially.
 */
function processScrapedData(allData) {
    for (const [position, record] of allData.entries()) {
        record['序号'] = String(position + 1);
    }
    console.log('序号字段已重新编号。');
    return allData;
}
/**
 * Runs the whole job: scrape the company list, renumber the records and write
 * them as pretty-printed JSON to `../data/companies.json` relative to this
 * script. Any failure is logged (including the HTTP status and body when the
 * error carries a response) and the process exits with code 1.
 *
 * @returns {Promise<void>}
 */
export async function main() {
    try {
        console.log('🚀 开始抓取普宁房地产开发企业列表...');
        const processedData = processScrapedData(await scrapeWebsite());

        const outputDir = path.join(__dirname, '..', 'data');
        const dataPath = path.join(outputDir, 'companies.json');
        // Create the data directory first so the write cannot fail on a missing folder.
        await fs.mkdir(outputDir, { recursive: true });
        const serialized = JSON.stringify(processedData, null, 4);
        await fs.writeFile(dataPath, serialized, 'utf-8');

        console.log(`✅ 公司列表数据已保存至 data/companies.json 文件。`);
        console.log('\n所有公司数据抓取和处理任务已完成。');
    } catch (error) {
        console.error('❌ 抓取或处理公司列表过程中发生错误:', error.message);
        const httpResponse = error.response;
        if (httpResponse) {
            console.error('Status:', httpResponse.status);
            console.error('Data:', httpResponse.data);
        }
        process.exit(1); // signal failure to the calling shell
    }
}
// Start the scrape only when this file is the script handed to node,
// not when it is imported by another module.
if (process.argv[1] === __filename) {
    main();
}