This commit is contained in:
秦秋旭 2026-01-22 10:10:57 +08:00
parent 2f0fbab19a
commit 4b5cf56dd6
11 changed files with 143 additions and 132 deletions

View File

@ -4302,7 +4302,7 @@
"楼幢": [ "楼幢": [
{ {
"楼幢名称": "1幢", "楼幢名称": "1幢",
"成交均价": "7274.7", "成交均价": "7266.38",
"bid": "1099705" "bid": "1099705"
} }
], ],
@ -4313,7 +4313,7 @@
"批准时间": "2025-10-24", "批准时间": "2025-10-24",
"所在区域": "普宁市", "所在区域": "普宁市",
"总套数": 160, "总套数": 160,
"可售套数": 102, "可售套数": 101,
"许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?id=1012062" "许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?id=1012062"
}, },
{ {

View File

@ -284,7 +284,7 @@
"批准时间": "2025-10-24", "批准时间": "2025-10-24",
"所在区域": "普宁市", "所在区域": "普宁市",
"总套数": 160, "总套数": 160,
"可售套数": 102, "可售套数": 101,
"许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?id=1012062" "许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?id=1012062"
}, },
{ {

View File

@ -3294,7 +3294,7 @@
"楼幢": [ "楼幢": [
{ {
"楼幢名称": "1幢", "楼幢名称": "1幢",
"成交均价": "7274.7", "成交均价": "7266.38",
"bid": "1099705" "bid": "1099705"
} }
] ]

View File

@ -4,8 +4,8 @@
"description": "", "description": "",
"main": "index.js", "main": "index.js",
"scripts": { "scripts": {
"start": "node index.js", "start": "node scripts/index.js",
"clear": "rm -f Project/data.json PreSaleLicense/data.json merged_data.json" "clear": "node scripts/clear.js"
}, },
"keywords": [], "keywords": [],
"author": "", "author": "",
@ -17,4 +17,4 @@
"exceljs": "^4.4.0" "exceljs": "^4.4.0"
}, },
"type": "module" "type": "module"
} }

10
scripts/axios.js Normal file
View File

@ -0,0 +1,10 @@
import axios from 'axios';
// Shared axios instance: every request made through it carries the default
// User-Agent header below unless a call overrides it.
const DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
};
const axiosInstance = axios.create({ headers: DEFAULT_HEADERS });
export default axiosInstance;

30
scripts/clear.js Normal file
View File

@ -0,0 +1,30 @@
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
/**
 * Removes the `data` directory that sits next to this script's folder.
 *
 * Logs a success message after removal, logs an informational message when
 * the directory is absent, and exits the process with code 1 on any other
 * failure.
 *
 * @returns {Promise<void>}
 */
async function clearData() {
    const target = path.join(__dirname, '..', 'data');
    try {
        // access() rejects with ENOENT when the directory is missing, which
        // routes to the "nothing to clean" branch below.
        await fs.access(target);
        // Remove the directory together with everything inside it.
        await fs.rm(target, { recursive: true, force: true });
        console.log('✅ 已清空 data 目录');
    } catch (err) {
        const isMissing = err.code === 'ENOENT';
        if (isMissing) {
            console.log('📁 data 目录不存在,无需清理');
            return;
        }
        console.error('❌ 清理失败:', err.message);
        process.exit(1);
    }
}
// Script entry point: run the cleanup when this file is executed.
// clearData handles its own errors, so the returned promise is not awaited.
clearData();

29
scripts/generateCsv.js Normal file
View File

@ -0,0 +1,29 @@
import fs from 'fs/promises';
/**
 * Generates a CSV file from the given data.
 *
 * Columns are taken from the keys of the first record. Every header and cell
 * is wrapped in double quotes with embedded double quotes doubled. `null` and
 * `undefined` values are written as empty cells. The output starts with a
 * UTF-8 BOM.
 *
 * Exported so that other scripts can import it from this module.
 *
 * @param {Array<Object>} data - The records to convert to CSV.
 * @param {string} filePath - The path to save the CSV file.
 * @returns {Promise<void>} Resolves after the file is written, or right away
 *   when there is nothing to write.
 */
export async function generateCsv(data, filePath) {
    // Treat a missing / non-array input the same as an empty one.
    if (!Array.isArray(data) || data.length === 0) {
        console.log(`没有数据可生成 CSV 文件 (${filePath})。`);
        return;
    }

    // Quote one field; absent values become an empty cell instead of the
    // literal text "null" / "undefined".
    const toCsvField = (value) => {
        const text = value === null || value === undefined ? '' : String(value);
        return `"${text.replace(/"/g, '""')}"`;
    };

    const headers = Object.keys(data[0]);
    const csvRows = [headers.map(toCsvField).join(',')];
    for (const record of data) {
        csvRows.push(headers.map((header) => toCsvField(record[header])).join(','));
    }

    const BOM = '\uFEFF'; // UTF-8 BOM, presumably so spreadsheet apps detect the encoding
    await fs.writeFile(filePath, BOM + csvRows.join('\n'), 'utf-8');
    console.log(`已生成 ${filePath} 文件。`);
}

43
scripts/generateXlsx.js Normal file
View File

@ -0,0 +1,43 @@
import ExcelJS from 'exceljs';
/**
 * Generates an XLSX file from the given data.
 *
 * Creates a workbook with a single "Data" worksheet whose columns come from
 * the keys of the first record, styles the header row, enables an auto-filter
 * across the header row, and writes the workbook to `filePath`.
 *
 * Exported so that other scripts can import it from this module.
 *
 * @param {Array<Object>} data - The records to convert to XLSX.
 * @param {string} filePath - The path to save the XLSX file.
 * @returns {Promise<void>} Resolves after the file is written, or right away
 *   when there is nothing to write.
 */
export async function generateXlsx(data, filePath) {
    // Treat a missing / non-array input the same as an empty one.
    if (!Array.isArray(data) || data.length === 0) {
        console.log(`没有数据可生成 XLSX 文件 (${filePath})。`);
        return;
    }

    const workbook = new ExcelJS.Workbook();
    const worksheet = workbook.addWorksheet('Data');
    const headers = Object.keys(data[0]);

    // Columns whose name contains "地址" or "链接" get a wider width (40 vs 20).
    worksheet.columns = headers.map((key) => ({
        header: key,
        key,
        width: key.includes('地址') || key.includes('链接') ? 40 : 20
    }));
    worksheet.addRows(data);

    // Style the header row: bold text, solid light-grey fill, centered.
    worksheet.getRow(1).eachCell((cell) => {
        cell.font = { bold: true };
        cell.fill = {
            type: 'pattern',
            pattern: 'solid',
            fgColor: { argb: 'FFDDDDDD' }
        };
        cell.alignment = { vertical: 'middle', horizontal: 'center' };
    });

    // Auto-filter spanning the header row from A1 to the last column.
    worksheet.autoFilter = {
        from: 'A1',
        to: {
            row: 1,
            column: headers.length
        }
    };

    await workbook.xlsx.writeFile(filePath);
    console.log(`已生成 ${filePath} 文件。`);
}

View File

@ -1,7 +1,7 @@
import axios from 'axios'; import axios from './axios.js';
import * as cheerio from 'cheerio'; import * as cheerio from 'cheerio';
import fs from 'fs/promises'; import fs from 'fs/promises';
import ExcelJS from 'exceljs'; // Import ExcelJS for XLSX generation import path from 'path';
const BASE_URL = 'http://120.236.48.169:89/HPMS/'; const BASE_URL = 'http://120.236.48.169:89/HPMS/';
const START_URL = BASE_URL + 'presellCertList.aspx'; const START_URL = BASE_URL + 'presellCertList.aspx';
@ -57,11 +57,7 @@ function extractDataFromHtml($) {
async function scrapeWebsite() { async function scrapeWebsite() {
let allData = []; let allData = [];
console.log('开始抓取第一页数据...'); console.log('开始抓取第一页数据...');
let response = await axios.get(START_URL, { let response = await axios.get(START_URL);
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
});
let $ = cheerio.load(response.data); let $ = cheerio.load(response.data);
allData = extractDataFromHtml($); allData = extractDataFromHtml($);
@ -108,7 +104,6 @@ async function scrapeWebsite() {
response = await axios.post(START_URL, postData, { response = await axios.post(START_URL, postData, {
headers: { headers: {
'Content-Type': 'application/x-www-form-urlencoded', 'Content-Type': 'application/x-www-form-urlencoded',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Referer': START_URL, 'Referer': START_URL,
} }
}); });
@ -157,75 +152,6 @@ async function processScrapedData(allData) {
return processedData; return processedData;
} }
/**
 * Generates a CSV file from the given data.
 *
 * Columns are taken from the keys of the first record. Every header and cell
 * is wrapped in double quotes with embedded double quotes doubled. `null` and
 * `undefined` values are written as empty cells. The output starts with a
 * UTF-8 BOM.
 *
 * @param {Array<Object>} data - The records to convert to CSV.
 * @param {string} filePath - The path to save the CSV file.
 * @returns {Promise<void>} Resolves after the file is written, or right away
 *   when there is nothing to write.
 */
async function generateCsv(data, filePath) {
    // Treat a missing / non-array input the same as an empty one.
    if (!Array.isArray(data) || data.length === 0) {
        console.log(`没有数据可生成 CSV 文件 (${filePath})。`);
        return;
    }

    // Quote one field; absent values become an empty cell instead of the
    // literal text "null" / "undefined".
    const quoteField = (value) => {
        const text = value === null || value === undefined ? '' : String(value);
        return `"${text.replace(/"/g, '""')}"`;
    };

    const headers = Object.keys(data[0]);
    const csvRows = [headers.map(quoteField).join(',')];
    for (const record of data) {
        csvRows.push(headers.map((header) => quoteField(record[header])).join(','));
    }

    const BOM = '\uFEFF'; // UTF-8 BOM, presumably so spreadsheet apps detect the encoding
    await fs.writeFile(filePath, BOM + csvRows.join('\n'), 'utf-8');
    console.log(`已生成 ${filePath} 文件。`);
}
/**
 * Writes the given records to an XLSX workbook with a single "Data" sheet.
 *
 * Column definitions come from the keys of the first record. The header row
 * is styled and an auto-filter is set across it before the file is saved.
 *
 * @param {Array<Object>} data - The records to convert to XLSX.
 * @param {string} filePath - The path to save the XLSX file.
 */
async function generateXlsx(data, filePath) {
    const hasRows = data.length !== 0;
    if (!hasRows) {
        console.log(`没有数据可生成 XLSX 文件 (${filePath})。`);
        return;
    }

    const book = new ExcelJS.Workbook();
    const sheet = book.addWorksheet('Data');
    const columnKeys = Object.keys(data[0]);

    // Columns whose name contains "地址" or "链接" are made wider (40 vs 20).
    const columnDefs = [];
    for (const columnKey of columnKeys) {
        const isWide = columnKey.includes('地址') || columnKey.includes('链接');
        columnDefs.push({
            header: columnKey,
            key: columnKey,
            width: isWide ? 40 : 20
        });
    }
    sheet.columns = columnDefs;
    sheet.addRows(data);

    // Header row styling: bold, solid light-grey fill, centered content.
    const headerRow = sheet.getRow(1);
    headerRow.eachCell((headerCell) => {
        headerCell.font = { bold: true };
        headerCell.fill = { type: 'pattern', pattern: 'solid', fgColor: { argb: 'FFDDDDDD' } };
        headerCell.alignment = { vertical: 'middle', horizontal: 'center' };
    });

    // Auto-filter from A1 to the last column of the header row.
    const lastHeaderCell = { row: 1, column: columnKeys.length };
    sheet.autoFilter = { from: 'A1', to: lastHeaderCell };

    await book.xlsx.writeFile(filePath);
    console.log(`已生成 ${filePath} 文件。`);
}
// 主函数 - 导出以便在根目录调用 // 主函数 - 导出以便在根目录调用
export async function main() { export async function main() {
try { try {
@ -233,11 +159,9 @@ export async function main() {
const processedData = await processScrapedData(allData); const processedData = await processScrapedData(allData);
await fs.writeFile('data.json', JSON.stringify(processedData, null, 4), 'utf-8'); const dataPath = path.join(path.dirname(new URL(import.meta.url).pathname), '..', 'data', 'preSaleLicense.json');
console.log(`更新后的数据已保存至 data.json 文件。`); await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8');
console.log(`更新后的数据已保存至 data/preSaleLicense.json 文件。`);
// await generateCsv(processedData, './data.csv');
// await generateXlsx(processedData, './data.xlsx');
console.log('\n所有数据处理和文件生成任务已完成。'); console.log('\n所有数据处理和文件生成任务已完成。');

View File

@ -1,4 +1,4 @@
import axios from 'axios'; import axios from './axios.js';
import * as cheerio from 'cheerio'; import * as cheerio from 'cheerio';
import fs from 'fs/promises'; import fs from 'fs/promises';
import path from 'path'; import path from 'path';
@ -27,10 +27,7 @@ async function getEncryptedUrl(relativeUrl, origin) {
const path = urlParts[0]; const path = urlParts[0];
const queryString = urlParts[1]; const queryString = urlParts[1];
const xmlPayload = `<?xml version="1.0" encoding="utf-8" standalone="yes"?> const xmlPayload = `<?xml version="1.0" encoding="utf-8" standalone="yes"?> <param funname="SouthDigital.BasicFun.MyEncrypt.DefaultEncrypt"> <item>${xmlEncode(queryString)}</item> </param>`;
<param funname="SouthDigital.BasicFun.MyEncrypt.DefaultEncrypt">
<item>${xmlEncode(queryString)}</item>
</param>`;
const encryptionUrl = new URL('/Common/Agents/ExeFunCommon.aspx', origin).href; const encryptionUrl = new URL('/Common/Agents/ExeFunCommon.aspx', origin).href;
@ -38,7 +35,6 @@ async function getEncryptedUrl(relativeUrl, origin) {
const response = await axios.post(encryptionUrl, xmlPayload, { const response = await axios.post(encryptionUrl, xmlPayload, {
headers: { headers: {
'Content-Type': 'application/xml', 'Content-Type': 'application/xml',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
} }
}); });
const encryptedQuery = response.data; const encryptedQuery = response.data;
@ -98,11 +94,7 @@ async function scrapeProjectDetails(project) {
try { try {
console.log(`正在抓取详情: ${project['项目名称']}`); console.log(`正在抓取详情: ${project['项目名称']}`);
const response = await axios.get(project['项目链接'], { const response = await axios.get(project['项目链接']);
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
});
const $ = cheerio.load(response.data); const $ = cheerio.load(response.data);
// 抓取 housedetail 表格中的所有信息 // 抓取 housedetail 表格中的所有信息
@ -177,11 +169,7 @@ async function scrapeWebsite() {
console.log('开始抓取第一页数据...'); console.log('开始抓取第一页数据...');
const origin = new URL(START_URL).origin; const origin = new URL(START_URL).origin;
let response = await axios.get(START_URL, { let response = await axios.get(START_URL);
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
});
let $ = cheerio.load(response.data); let $ = cheerio.load(response.data);
let firstPageData = await extractDataFromHtml($, origin); let firstPageData = await extractDataFromHtml($, origin);
@ -233,7 +221,6 @@ async function scrapeWebsite() {
response = await axios.post(START_URL, postData, { response = await axios.post(START_URL, postData, {
headers: { headers: {
'Content-Type': 'application/x-www-form-urlencoded', 'Content-Type': 'application/x-www-form-urlencoded',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Referer': START_URL, 'Referer': START_URL,
} }
}); });
@ -282,8 +269,9 @@ export async function main() {
const processedData = await processScrapedData(allData); const processedData = await processScrapedData(allData);
// 保存为JSON文件 // 保存为JSON文件
await fs.writeFile(path.join(__dirname, 'data.json'), JSON.stringify(processedData, null, 4), 'utf-8'); const dataPath = path.join(__dirname, '..', 'data', 'project.json');
console.log('项目数据已保存至 data.json 文件。'); await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8');
console.log('项目数据已保存至 data/project.json 文件。');
console.log('\n所有数据抓取和处理任务已完成。'); console.log('\n所有数据抓取和处理任务已完成。');

View File

@ -1,7 +1,7 @@
import fs from 'fs/promises'; import fs from 'fs/promises';
import path from 'path'; import path from 'path';
import { main as projectMain } from './Project/index.js'; import { main as getProject } from './getProject.js';
import { main as licenseMain } from './PreSaleLicense/index.js'; import { main as getPreSaleLicense } from './getPreSaleLicense.js';
// 检查数据文件是否存在 // 检查数据文件是否存在
async function checkDataFileExists(dataPath) { async function checkDataFileExists(dataPath) {
@ -14,7 +14,7 @@ async function checkDataFileExists(dataPath) {
} }
// 执行脚本函数并等待完成 // 执行脚本函数并等待完成
async function runScript(scriptFunction, description, cwd, dataFilePath) { async function runScript(scriptFunction, description, dataFilePath) {
console.log(`开始执行: ${description}`); console.log(`开始执行: ${description}`);
// 检查数据文件是否已存在 // 检查数据文件是否已存在
@ -25,19 +25,8 @@ async function runScript(scriptFunction, description, cwd, dataFilePath) {
} }
try { try {
// 临时改变工作目录
const originalCwd = process.cwd();
if (cwd) {
process.chdir(cwd);
}
await scriptFunction(); await scriptFunction();
// 恢复原始工作目录
if (cwd) {
process.chdir(originalCwd);
}
console.log(`${description} 执行完成\n`); console.log(`${description} 执行完成\n`);
return true; return true;
} catch (error) { } catch (error) {
@ -52,11 +41,11 @@ async function mergeLicenseData() {
try { try {
// 读取项目数据 // 读取项目数据
const projectDataPath = path.join(process.cwd(), 'Project', 'data.json'); const projectDataPath = path.join(process.cwd(), 'data', 'project.json');
const projectData = JSON.parse(await fs.readFile(projectDataPath, 'utf-8')); const projectData = JSON.parse(await fs.readFile(projectDataPath, 'utf-8'));
// 读取许可证数据 // 读取许可证数据
const licenseDataPath = path.join(process.cwd(), 'PreSaleLicense', 'data.json'); const licenseDataPath = path.join(process.cwd(), 'data', 'preSaleLicense.json');
const licenseData = JSON.parse(await fs.readFile(licenseDataPath, 'utf-8')); const licenseData = JSON.parse(await fs.readFile(licenseDataPath, 'utf-8'));
// 创建许可证号到许可证数据的映射 // 创建许可证号到许可证数据的映射
@ -93,7 +82,7 @@ async function mergeLicenseData() {
console.log(`✅ 成功合并 ${mergedCount} 个许可证详情`); console.log(`✅ 成功合并 ${mergedCount} 个许可证详情`);
// 保存合并后的数据 // 保存合并后的数据
const outputPath = path.join(process.cwd(), 'merged_data.json'); const outputPath = path.join(process.cwd(), 'data', 'merged_data.json');
await fs.writeFile(outputPath, JSON.stringify(projectData, null, 4), 'utf-8'); await fs.writeFile(outputPath, JSON.stringify(projectData, null, 4), 'utf-8');
console.log(`📁 合并后的数据已保存至: ${outputPath}`); console.log(`📁 合并后的数据已保存至: ${outputPath}`);
@ -113,17 +102,15 @@ async function main() {
try { try {
// 步骤1: 获取项目数据 // 步骤1: 获取项目数据
const projectCwd = path.join(process.cwd(), 'Project'); const projectDataPath = path.join(process.cwd(), 'data', 'project.json');
const projectDataPath = path.join(projectCwd, 'data.json'); const projectSuccess = await runScript(getProject, '项目信息抓取脚本', projectDataPath);
const projectSuccess = await runScript(projectMain, '项目信息抓取脚本', projectCwd, projectDataPath);
if (!projectSuccess) { if (!projectSuccess) {
throw new Error('项目信息抓取失败'); throw new Error('项目信息抓取失败');
} }
// 步骤2: 获取许可证数据 // 步骤2: 获取许可证数据
const licenseCwd = path.join(process.cwd(), 'PreSaleLicense'); const licenseDataPath = path.join(process.cwd(), 'data', 'preSaleLicense.json');
const licenseDataPath = path.join(licenseCwd, 'data.json'); const licenseSuccess = await runScript(getPreSaleLicense, '预售许可证抓取脚本', licenseDataPath);
const licenseSuccess = await runScript(licenseMain, '预售许可证抓取脚本', licenseCwd, licenseDataPath);
if (!licenseSuccess) { if (!licenseSuccess) {
throw new Error('预售许可证抓取失败'); throw new Error('预售许可证抓取失败');
} }
@ -132,7 +119,7 @@ async function main() {
await mergeLicenseData(); await mergeLicenseData();
console.log('\n🎉 所有数据处理完成!'); console.log('\n🎉 所有数据处理完成!');
console.log('📁 输出文件: merged_data.json'); console.log('📁 输出文件: data/merged_data.json');
} catch (error) { } catch (error) {
console.error('\n💥 处理过程中发生错误:', error.message); console.error('\n💥 处理过程中发生错误:', error.message);