refactor
This commit is contained in:
parent
2f0fbab19a
commit
4b5cf56dd6
@ -4302,7 +4302,7 @@
|
||||
"楼幢": [
|
||||
{
|
||||
"楼幢名称": "1幢",
|
||||
"成交均价": "7274.7",
|
||||
"成交均价": "7266.38",
|
||||
"bid": "1099705"
|
||||
}
|
||||
],
|
||||
@ -4313,7 +4313,7 @@
|
||||
"批准时间": "2025-10-24",
|
||||
"所在区域": "普宁市",
|
||||
"总套数": 160,
|
||||
"可售套数": 102,
|
||||
"可售套数": 101,
|
||||
"许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?id=1012062"
|
||||
},
|
||||
{
|
||||
@ -284,7 +284,7 @@
|
||||
"批准时间": "2025-10-24",
|
||||
"所在区域": "普宁市",
|
||||
"总套数": 160,
|
||||
"可售套数": 102,
|
||||
"可售套数": 101,
|
||||
"许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?id=1012062"
|
||||
},
|
||||
{
|
||||
@ -3294,7 +3294,7 @@
|
||||
"楼幢": [
|
||||
{
|
||||
"楼幢名称": "1幢",
|
||||
"成交均价": "7274.7",
|
||||
"成交均价": "7266.38",
|
||||
"bid": "1099705"
|
||||
}
|
||||
]
|
||||
@ -4,8 +4,8 @@
|
||||
"description": "",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"start": "node index.js",
|
||||
"clear": "rm -f Project/data.json PreSaleLicense/data.json merged_data.json"
|
||||
"start": "node scripts/index.js",
|
||||
"clear": "node scripts/clear.js"
|
||||
},
|
||||
"keywords": [],
|
||||
"author": "",
|
||||
|
||||
10
scripts/axios.js
Normal file
10
scripts/axios.js
Normal file
@ -0,0 +1,10 @@
|
||||
import axios from 'axios';
|
||||
|
||||
// Shared axios instance: every request made through it carries a
// desktop-browser User-Agent header unless the caller overrides it.
const DEFAULT_HEADERS = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
};

const axiosInstance = axios.create({ headers: DEFAULT_HEADERS });
|
||||
|
||||
export default axiosInstance;
|
||||
30
scripts/clear.js
Normal file
30
scripts/clear.js
Normal file
@ -0,0 +1,30 @@
|
||||
import fs from 'fs/promises';
|
||||
import path from 'path';
|
||||
import { fileURLToPath } from 'url';
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = path.dirname(__filename);
|
||||
|
||||
/**
 * Deletes a data directory and everything inside it.
 *
 * Logs a success message when the directory was removed and an informational
 * message when it did not exist. Any other failure is logged and the process
 * exit code is set to 1; the function itself never rejects.
 *
 * @param {string} [dataDir] - Directory to remove. Defaults to the `data`
 *   directory that sits next to this script's parent folder.
 * @returns {Promise<void>}
 */
async function clearData(dataDir = path.join(__dirname, '..', 'data')) {
  try {
    // `force` is deliberately left off: fs.rm then rejects with ENOENT when the
    // directory is missing, so existence check and removal are a single call
    // instead of a separate access() followed by rm() (check-then-act race).
    await fs.rm(dataDir, { recursive: true });
    console.log('✅ 已清空 data 目录');
  } catch (error) {
    if (error.code === 'ENOENT') {
      console.log('📁 data 目录不存在,无需清理');
      return;
    }
    console.error('❌ 清理失败:', error.message);
    // Set the exit code rather than calling process.exit() so pending output
    // is flushed and the process ends naturally.
    process.exitCode = 1;
  }
}
|
||||
|
||||
clearData();
|
||||
29
scripts/generateCsv.js
Normal file
29
scripts/generateCsv.js
Normal file
@ -0,0 +1,29 @@
|
||||
import fs from 'fs/promises';
|
||||
|
||||
/**
 * Writes an array of flat records to a CSV file.
 *
 * The column set is taken from the keys of the first record. Every field
 * (header names included) is wrapped in double quotes with embedded quotes
 * doubled, and `null`/`undefined` cells are written as empty fields. The file
 * is UTF-8 with a leading BOM.
 *
 * When `data` is empty or not an array, a message is logged and no file is
 * written.
 *
 * @param {Array<Object>} data - The records to convert to CSV.
 * @param {string} filePath - The path to save the CSV file.
 * @returns {Promise<void>}
 */
export async function generateCsv(data, filePath) {
  if (!Array.isArray(data) || data.length === 0) {
    console.log(`没有数据可生成 CSV 文件 (${filePath})。`);
    return;
  }

  // Quote a single field; null/undefined become an empty field instead of the
  // literal text "null"/"undefined".
  const toCsvField = (value) => `"${String(value ?? '').replace(/"/g, '""')}"`;

  const headers = Object.keys(data[0]);
  const csvRows = [
    headers.map((header) => toCsvField(header)).join(','),
    ...data.map((record) => headers.map((header) => toCsvField(record[header])).join(',')),
  ];

  const csvContent = csvRows.join('\n');
  const BOM = '\uFEFF'; // UTF-8 BOM
  await fs.writeFile(filePath, BOM + csvContent, 'utf-8');
  console.log(`已生成 ${filePath} 文件。`);
}
|
||||
43
scripts/generateXlsx.js
Normal file
43
scripts/generateXlsx.js
Normal file
@ -0,0 +1,43 @@
|
||||
import ExcelJS from 'exceljs';
|
||||
|
||||
/**
 * Writes an array of flat records to an XLSX workbook with one sheet, "Data".
 *
 * Columns come from the keys of the first record. Columns whose name contains
 * '地址' or '链接' are given a wider width. The header row is bold, grey-filled
 * and centred, and an auto-filter is placed on it.
 *
 * When `data` is empty or not an array, a message is logged and no file is
 * written.
 *
 * @param {Array<Object>} data - The records to convert to XLSX.
 * @param {string} filePath - The path to save the XLSX file.
 * @returns {Promise<void>}
 */
export async function generateXlsx(data, filePath) {
  if (!Array.isArray(data) || data.length === 0) {
    console.log(`没有数据可生成 XLSX 文件 (${filePath})。`);
    return;
  }

  const workbook = new ExcelJS.Workbook();
  const worksheet = workbook.addWorksheet('Data');

  const headers = Object.keys(data[0]);
  worksheet.columns = headers.map((key) => ({
    header: key,
    key,
    // Address and link columns hold long text, so they get double width.
    width: key.includes('地址') || key.includes('链接') ? 40 : 20,
  }));
  worksheet.addRows(data);

  // Style the header row (row 1).
  worksheet.getRow(1).eachCell((cell) => {
    cell.font = { bold: true };
    cell.fill = {
      type: 'pattern',
      pattern: 'solid',
      fgColor: { argb: 'FFDDDDDD' },
    };
    cell.alignment = { vertical: 'middle', horizontal: 'center' };
  });

  // Filter drop-downs across every header cell.
  worksheet.autoFilter = {
    from: 'A1',
    to: {
      row: 1,
      column: headers.length,
    },
  };

  await workbook.xlsx.writeFile(filePath);
  console.log(`已生成 ${filePath} 文件。`);
}
|
||||
@ -1,7 +1,7 @@
|
||||
import axios from 'axios';
|
||||
import axios from './axios.js';
|
||||
import * as cheerio from 'cheerio';
|
||||
import fs from 'fs/promises';
|
||||
import ExcelJS from 'exceljs'; // Import ExcelJS for XLSX generation
|
||||
import path from 'path';
|
||||
|
||||
const BASE_URL = 'http://120.236.48.169:89/HPMS/';
|
||||
const START_URL = BASE_URL + 'presellCertList.aspx';
|
||||
@ -57,11 +57,7 @@ function extractDataFromHtml($) {
|
||||
async function scrapeWebsite() {
|
||||
let allData = [];
|
||||
console.log('开始抓取第一页数据...');
|
||||
let response = await axios.get(START_URL, {
|
||||
headers: {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||
}
|
||||
});
|
||||
let response = await axios.get(START_URL);
|
||||
|
||||
let $ = cheerio.load(response.data);
|
||||
allData = extractDataFromHtml($);
|
||||
@ -108,7 +104,6 @@ async function scrapeWebsite() {
|
||||
response = await axios.post(START_URL, postData, {
|
||||
headers: {
|
||||
'Content-Type': 'application/x-www-form-urlencoded',
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
||||
'Referer': START_URL,
|
||||
}
|
||||
});
|
||||
@ -157,75 +152,6 @@ async function processScrapedData(allData) {
|
||||
return processedData;
|
||||
}
|
||||
|
||||
/**
 * Serialises flat records into a CSV file (UTF-8 with BOM).
 *
 * Column order follows the keys of the first record. All fields, header names
 * included, are double-quoted with inner quotes doubled; `null`/`undefined`
 * values produce empty fields. Nothing is written when `data` is empty or is
 * not an array; a message is logged instead.
 *
 * @param {Array<Object>} data - The data to convert to CSV.
 * @param {string} filePath - The path to save the CSV file.
 * @returns {Promise<void>}
 */
async function generateCsv(data, filePath) {
  if (!Array.isArray(data) || data.length === 0) {
    console.log(`没有数据可生成 CSV 文件 (${filePath})。`);
    return;
  }

  // Shared quoting rule for header and value cells; a missing value becomes an
  // empty field rather than the text "undefined".
  const quote = (cell) => `"${String(cell ?? '').replace(/"/g, '""')}"`;

  const headers = Object.keys(data[0]);
  const csvRows = [headers.map((name) => quote(name)).join(',')];
  for (const record of data) {
    csvRows.push(headers.map((name) => quote(record[name])).join(','));
  }

  const BOM = '\uFEFF'; // UTF-8 BOM
  await fs.writeFile(filePath, BOM + csvRows.join('\n'), 'utf-8');
  console.log(`已生成 ${filePath} 文件。`);
}
|
||||
|
||||
/**
 * Builds a one-sheet ("Data") XLSX workbook from flat records and saves it.
 *
 * The first record's keys define the columns; names containing '地址' or
 * '链接' get width 40, all others 20. Row 1 is styled as a bold, grey, centred
 * header with an auto-filter. Nothing is written when `data` is empty or is
 * not an array; a message is logged instead.
 *
 * @param {Array<Object>} data - The data to convert to XLSX.
 * @param {string} filePath - The path to save the XLSX file.
 * @returns {Promise<void>}
 */
async function generateXlsx(data, filePath) {
  if (!Array.isArray(data) || data.length === 0) {
    console.log(`没有数据可生成 XLSX 文件 (${filePath})。`);
    return;
  }

  const workbook = new ExcelJS.Workbook();
  const worksheet = workbook.addWorksheet('Data');

  const headers = Object.keys(data[0]);
  worksheet.columns = headers.map((key) => ({
    header: key,
    key,
    width: key.includes('地址') || key.includes('链接') ? 40 : 20,
  }));
  worksheet.addRows(data);

  // Header row styling.
  worksheet.getRow(1).eachCell((cell) => {
    cell.font = { bold: true };
    cell.fill = {
      type: 'pattern',
      pattern: 'solid',
      fgColor: { argb: 'FFDDDDDD' },
    };
    cell.alignment = { vertical: 'middle', horizontal: 'center' };
  });

  worksheet.autoFilter = {
    from: 'A1',
    to: {
      row: 1,
      column: headers.length,
    },
  };

  await workbook.xlsx.writeFile(filePath);
  console.log(`已生成 ${filePath} 文件。`);
}
|
||||
|
||||
|
||||
// 主函数 - 导出以便在根目录调用
|
||||
export async function main() {
|
||||
try {
|
||||
@ -233,11 +159,9 @@ export async function main() {
|
||||
|
||||
const processedData = await processScrapedData(allData);
|
||||
|
||||
await fs.writeFile('data.json', JSON.stringify(processedData, null, 4), 'utf-8');
|
||||
console.log(`更新后的数据已保存至 data.json 文件。`);
|
||||
|
||||
// await generateCsv(processedData, './data.csv');
|
||||
// await generateXlsx(processedData, './data.xlsx');
|
||||
const dataPath = path.join(path.dirname(new URL(import.meta.url).pathname), '..', 'data', 'preSaleLicense.json');
|
||||
await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8');
|
||||
console.log(`更新后的数据已保存至 data/preSaleLicense.json 文件。`);
|
||||
|
||||
console.log('\n所有数据处理和文件生成任务已完成。');
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
import axios from 'axios';
|
||||
import axios from './axios.js';
|
||||
import * as cheerio from 'cheerio';
|
||||
import fs from 'fs/promises';
|
||||
import path from 'path';
|
||||
@ -27,10 +27,7 @@ async function getEncryptedUrl(relativeUrl, origin) {
|
||||
const path = urlParts[0];
|
||||
const queryString = urlParts[1];
|
||||
|
||||
const xmlPayload = `<?xml version="1.0" encoding="utf-8" standalone="yes"?>
|
||||
<param funname="SouthDigital.BasicFun.MyEncrypt.DefaultEncrypt">
|
||||
<item>${xmlEncode(queryString)}</item>
|
||||
</param>`;
|
||||
const xmlPayload = `<?xml version="1.0" encoding="utf-8" standalone="yes"?> <param funname="SouthDigital.BasicFun.MyEncrypt.DefaultEncrypt"> <item>${xmlEncode(queryString)}</item> </param>`;
|
||||
|
||||
const encryptionUrl = new URL('/Common/Agents/ExeFunCommon.aspx', origin).href;
|
||||
|
||||
@ -38,7 +35,6 @@ async function getEncryptedUrl(relativeUrl, origin) {
|
||||
const response = await axios.post(encryptionUrl, xmlPayload, {
|
||||
headers: {
|
||||
'Content-Type': 'application/xml',
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
||||
}
|
||||
});
|
||||
const encryptedQuery = response.data;
|
||||
@ -98,11 +94,7 @@ async function scrapeProjectDetails(project) {
|
||||
|
||||
try {
|
||||
console.log(`正在抓取详情: ${project['项目名称']}`);
|
||||
const response = await axios.get(project['项目链接'], {
|
||||
headers: {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||
}
|
||||
});
|
||||
const response = await axios.get(project['项目链接']);
|
||||
const $ = cheerio.load(response.data);
|
||||
|
||||
// 抓取 housedetail 表格中的所有信息
|
||||
@ -177,11 +169,7 @@ async function scrapeWebsite() {
|
||||
console.log('开始抓取第一页数据...');
|
||||
const origin = new URL(START_URL).origin;
|
||||
|
||||
let response = await axios.get(START_URL, {
|
||||
headers: {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||
}
|
||||
});
|
||||
let response = await axios.get(START_URL);
|
||||
|
||||
let $ = cheerio.load(response.data);
|
||||
let firstPageData = await extractDataFromHtml($, origin);
|
||||
@ -233,7 +221,6 @@ async function scrapeWebsite() {
|
||||
response = await axios.post(START_URL, postData, {
|
||||
headers: {
|
||||
'Content-Type': 'application/x-www-form-urlencoded',
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
||||
'Referer': START_URL,
|
||||
}
|
||||
});
|
||||
@ -282,8 +269,9 @@ export async function main() {
|
||||
const processedData = await processScrapedData(allData);
|
||||
|
||||
// 保存为JSON文件
|
||||
await fs.writeFile(path.join(__dirname, 'data.json'), JSON.stringify(processedData, null, 4), 'utf-8');
|
||||
console.log('项目数据已保存至 data.json 文件。');
|
||||
const dataPath = path.join(__dirname, '..', 'data', 'project.json');
|
||||
await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8');
|
||||
console.log('项目数据已保存至 data/project.json 文件。');
|
||||
|
||||
console.log('\n所有数据抓取和处理任务已完成。');
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
import fs from 'fs/promises';
|
||||
import path from 'path';
|
||||
import { main as projectMain } from './Project/index.js';
|
||||
import { main as licenseMain } from './PreSaleLicense/index.js';
|
||||
import { main as getProject } from './getProject.js';
|
||||
import { main as getPreSaleLicense } from './getPreSaleLicense.js';
|
||||
|
||||
// 检查数据文件是否存在
|
||||
async function checkDataFileExists(dataPath) {
|
||||
@ -14,7 +14,7 @@ async function checkDataFileExists(dataPath) {
|
||||
}
|
||||
|
||||
// 执行脚本函数并等待完成
|
||||
async function runScript(scriptFunction, description, cwd, dataFilePath) {
|
||||
async function runScript(scriptFunction, description, dataFilePath) {
|
||||
console.log(`开始执行: ${description}`);
|
||||
|
||||
// 检查数据文件是否已存在
|
||||
@ -25,19 +25,8 @@ async function runScript(scriptFunction, description, cwd, dataFilePath) {
|
||||
}
|
||||
|
||||
try {
|
||||
// 临时改变工作目录
|
||||
const originalCwd = process.cwd();
|
||||
if (cwd) {
|
||||
process.chdir(cwd);
|
||||
}
|
||||
|
||||
await scriptFunction();
|
||||
|
||||
// 恢复原始工作目录
|
||||
if (cwd) {
|
||||
process.chdir(originalCwd);
|
||||
}
|
||||
|
||||
console.log(`✅ ${description} 执行完成\n`);
|
||||
return true;
|
||||
} catch (error) {
|
||||
@ -52,11 +41,11 @@ async function mergeLicenseData() {
|
||||
|
||||
try {
|
||||
// 读取项目数据
|
||||
const projectDataPath = path.join(process.cwd(), 'Project', 'data.json');
|
||||
const projectDataPath = path.join(process.cwd(), 'data', 'project.json');
|
||||
const projectData = JSON.parse(await fs.readFile(projectDataPath, 'utf-8'));
|
||||
|
||||
// 读取许可证数据
|
||||
const licenseDataPath = path.join(process.cwd(), 'PreSaleLicense', 'data.json');
|
||||
const licenseDataPath = path.join(process.cwd(), 'data', 'preSaleLicense.json');
|
||||
const licenseData = JSON.parse(await fs.readFile(licenseDataPath, 'utf-8'));
|
||||
|
||||
// 创建许可证号到许可证数据的映射
|
||||
@ -93,7 +82,7 @@ async function mergeLicenseData() {
|
||||
console.log(`✅ 成功合并 ${mergedCount} 个许可证详情`);
|
||||
|
||||
// 保存合并后的数据
|
||||
const outputPath = path.join(process.cwd(), 'merged_data.json');
|
||||
const outputPath = path.join(process.cwd(), 'data', 'merged_data.json');
|
||||
await fs.writeFile(outputPath, JSON.stringify(projectData, null, 4), 'utf-8');
|
||||
|
||||
console.log(`📁 合并后的数据已保存至: ${outputPath}`);
|
||||
@ -113,17 +102,15 @@ async function main() {
|
||||
|
||||
try {
|
||||
// 步骤1: 获取项目数据
|
||||
const projectCwd = path.join(process.cwd(), 'Project');
|
||||
const projectDataPath = path.join(projectCwd, 'data.json');
|
||||
const projectSuccess = await runScript(projectMain, '项目信息抓取脚本', projectCwd, projectDataPath);
|
||||
const projectDataPath = path.join(process.cwd(), 'data', 'project.json');
|
||||
const projectSuccess = await runScript(getProject, '项目信息抓取脚本', projectDataPath);
|
||||
if (!projectSuccess) {
|
||||
throw new Error('项目信息抓取失败');
|
||||
}
|
||||
|
||||
// 步骤2: 获取许可证数据
|
||||
const licenseCwd = path.join(process.cwd(), 'PreSaleLicense');
|
||||
const licenseDataPath = path.join(licenseCwd, 'data.json');
|
||||
const licenseSuccess = await runScript(licenseMain, '预售许可证抓取脚本', licenseCwd, licenseDataPath);
|
||||
const licenseDataPath = path.join(process.cwd(), 'data', 'preSaleLicense.json');
|
||||
const licenseSuccess = await runScript(getPreSaleLicense, '预售许可证抓取脚本', licenseDataPath);
|
||||
if (!licenseSuccess) {
|
||||
throw new Error('预售许可证抓取失败');
|
||||
}
|
||||
@ -132,7 +119,7 @@ async function main() {
|
||||
await mergeLicenseData();
|
||||
|
||||
console.log('\n🎉 所有数据处理完成!');
|
||||
console.log('📁 输出文件: merged_data.json');
|
||||
console.log('📁 输出文件: data/merged_data.json');
|
||||
|
||||
} catch (error) {
|
||||
console.error('\n💥 处理过程中发生错误:', error.message);
|
||||
Loading…
Reference in New Issue
Block a user