From 4b5cf56dd605ae3df53b268c5c1eca4a7b5ea963 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=A7=A6=E7=A7=8B=E6=97=AD?= Date: Thu, 22 Jan 2026 10:10:57 +0800 Subject: [PATCH] refactor --- merged_data.json => data/merged_data.json | 4 +- .../data.json => data/preSaleLicense.json | 2 +- Project/data.json => data/project.json | 2 +- package.json | 6 +- scripts/axios.js | 10 +++ scripts/clear.js | 30 +++++++ scripts/generateCsv.js | 29 ++++++ scripts/generateXlsx.js | 43 +++++++++ .../index.js => scripts/getPreSaleLicense.js | 88 ++----------------- Project/index.js => scripts/getProject.js | 26 ++---- index.js => scripts/index.js | 35 +++----- 11 files changed, 143 insertions(+), 132 deletions(-) rename merged_data.json => data/merged_data.json (99%) rename PreSaleLicense/data.json => data/preSaleLicense.json (99%) rename Project/data.json => data/project.json (99%) create mode 100644 scripts/axios.js create mode 100644 scripts/clear.js create mode 100644 scripts/generateCsv.js create mode 100644 scripts/generateXlsx.js rename PreSaleLicense/index.js => scripts/getPreSaleLicense.js (68%) rename Project/index.js => scripts/getProject.js (90%) rename index.js => scripts/index.js (74%) diff --git a/merged_data.json b/data/merged_data.json similarity index 99% rename from merged_data.json rename to data/merged_data.json index 1dc700d..e169f0e 100644 --- a/merged_data.json +++ b/data/merged_data.json @@ -4302,7 +4302,7 @@ "楼幢": [ { "楼幢名称": "1幢", - "成交均价": "7274.7", + "成交均价": "7266.38", "bid": "1099705" } ], @@ -4313,7 +4313,7 @@ "批准时间": "2025-10-24", "所在区域": "普宁市", "总套数": 160, - "可售套数": 102, + "可售套数": 101, "许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?id=1012062" }, { diff --git a/PreSaleLicense/data.json b/data/preSaleLicense.json similarity index 99% rename from PreSaleLicense/data.json rename to data/preSaleLicense.json index 519e2ea..e6866a9 100644 --- a/PreSaleLicense/data.json +++ b/data/preSaleLicense.json @@ -284,7 +284,7 @@ "批准时间": "2025-10-24", "所在区域": "普宁市", "总套数": 160, - "可售套数": 102, + "可售套数": 101, "许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?id=1012062" }, { diff --git a/Project/data.json b/data/project.json similarity index 99% rename from Project/data.json rename to data/project.json index cb94bfe..22caa6d 100644 --- a/Project/data.json +++ b/data/project.json @@ -3294,7 +3294,7 @@ "楼幢": [ { "楼幢名称": "1幢", - "成交均价": "7274.7", + "成交均价": "7266.38", "bid": "1099705" } ] diff --git a/package.json b/package.json index d43cf9c..56b886f 100644 --- a/package.json +++ b/package.json @@ -4,8 +4,8 @@ "description": "", "main": "index.js", "scripts": { - "start": "node index.js", - "clear": "rm -f Project/data.json PreSaleLicense/data.json merged_data.json" + "start": "node scripts/index.js", + "clear": "node scripts/clear.js" }, "keywords": [], "author": "", @@ -17,4 +17,4 @@ "exceljs": "^4.4.0" }, "type": "module" -} +} \ No newline at end of file diff --git a/scripts/axios.js b/scripts/axios.js new file mode 100644 index 0000000..ac1b000 --- /dev/null +++ b/scripts/axios.js @@ -0,0 +1,10 @@ +import axios from 'axios'; + +// 创建配置了默认User-Agent的axios实例 +const axiosInstance = axios.create({ + headers: { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' + } +}); + +export default axiosInstance; \ No newline at end of file diff --git a/scripts/clear.js b/scripts/clear.js new file mode 100644 index 0000000..b6268ca --- /dev/null +++ b/scripts/clear.js @@ -0,0 +1,30 @@ +import fs from 'fs/promises'; +import path from 'path'; +import { fileURLToPath } from 'url'; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); + +async function clearData() { + const dataDir = path.join(__dirname, '..', 'data'); + + try { + // 检查data目录是否存在 + await fs.access(dataDir); + + // 删除data目录及其所有内容 + await fs.rm(dataDir, { recursive: true, force: true }); + + console.log('✅ 已清空 data 目录'); + + } catch (error) { + if (error.code === 'ENOENT') { + console.log('📁 data 目录不存在,无需清理'); + } else { + console.error('❌ 清理失败:', error.message); + process.exit(1); + } + } +} + +clearData(); \ No newline at end of file diff --git a/scripts/generateCsv.js b/scripts/generateCsv.js new file mode 100644 index 0000000..89df46f --- /dev/null +++ b/scripts/generateCsv.js @@ -0,0 +1,29 @@ +import fs from 'fs/promises'; + +/** + * Generates a CSV file from the given data. + * @param {Array} data - The data to convert to CSV. + * @param {string} filePath - The path to save the CSV file. + */ + +async function generateCsv(data, filePath) { + if (data.length === 0) { + console.log(`没有数据可生成 CSV 文件 (${filePath})。`); + return; + } + const headers = Object.keys(data[0]); + const csvRows = []; + csvRows.push(headers.map(header => `"${header}"`).join(',')); + for (const record of data) { + const values = headers.map(header => { + const value = record[header]; + // Ensure values are properly quoted and internal quotes are escaped + return `"${String(value).replace(new RegExp('"', 'g'), '""')}"`; + }); + csvRows.push(values.join(',')); + } + const csvContent = csvRows.join('\n'); + const BOM = '\uFEFF'; // UTF-8 BOM + await fs.writeFile(filePath, BOM + csvContent, 'utf-8'); + console.log(`已生成 ${filePath} 文件。`); +} diff --git a/scripts/generateXlsx.js b/scripts/generateXlsx.js new file mode 100644 index 0000000..e734772 --- /dev/null +++ b/scripts/generateXlsx.js @@ -0,0 +1,43 @@ +import ExcelJS from 'exceljs'; + +/** + * Generates an XLSX file from the given data. + * @param {Array} data - The data to convert to XLSX. + * @param {string} filePath - The path to save the XLSX file. + */ + +async function generateXlsx(data, filePath) { + if (data.length === 0) { + console.log(`没有数据可生成 XLSX 文件 (${filePath})。`); + return; + } + const workbook = new ExcelJS.Workbook(); + const worksheet = workbook.addWorksheet('Data'); + + const headers = Object.keys(data[0]); + worksheet.columns = headers.map(key => ({ + header: key, + key: key, + width: key.includes('地址') || key.includes('链接') ? 40 : 20 + })); + worksheet.addRows(data); + + worksheet.getRow(1).eachCell(cell => { + cell.font = { bold: true }; + cell.fill = { + type: 'pattern', + pattern: 'solid', + fgColor: { argb: 'FFDDDDDD' } + }; + cell.alignment = { vertical: 'middle', horizontal: 'center' }; + }); + worksheet.autoFilter = { + from: 'A1', + to: { + row: 1, + column: headers.length + } + }; + await workbook.xlsx.writeFile(filePath); + console.log(`已生成 ${filePath} 文件。`); +} diff --git a/PreSaleLicense/index.js b/scripts/getPreSaleLicense.js similarity index 68% rename from PreSaleLicense/index.js rename to scripts/getPreSaleLicense.js index a83b8f0..43dfe6e 100644 --- a/PreSaleLicense/index.js +++ b/scripts/getPreSaleLicense.js @@ -1,7 +1,7 @@ -import axios from 'axios'; +import axios from './axios.js'; import * as cheerio from 'cheerio'; import fs from 'fs/promises'; -import ExcelJS from 'exceljs'; // Import ExcelJS for XLSX generation +import path from 'path'; const BASE_URL = 'http://120.236.48.169:89/HPMS/'; const START_URL = BASE_URL + 'presellCertList.aspx'; @@ -57,11 +57,7 @@ function extractDataFromHtml($) { async function scrapeWebsite() { let allData = []; console.log('开始抓取第一页数据...'); - let response = await axios.get(START_URL, { - headers: { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' - } - }); + let response = await axios.get(START_URL); let $ = cheerio.load(response.data); allData = extractDataFromHtml($); @@ -108,7 +104,6 @@ async function scrapeWebsite() { response = await axios.post(START_URL, postData, { headers: { 'Content-Type': 'application/x-www-form-urlencoded', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'Referer': START_URL, } }); @@ -157,75 +152,6 @@ async function processScrapedData(allData) { return processedData; } -/** - * Generates a CSV file from the given data. - * @param {Array} data - The data to convert to CSV. - * @param {string} filePath - The path to save the CSV file. - */ -async function generateCsv(data, filePath) { - if (data.length === 0) { - console.log(`没有数据可生成 CSV 文件 (${filePath})。`); - return; - } - const headers = Object.keys(data[0]); - const csvRows = []; - csvRows.push(headers.map(header => `"${header}"`).join(',')); - for (const record of data) { - const values = headers.map(header => { - const value = record[header]; - // Ensure values are properly quoted and internal quotes are escaped - return `"${String(value).replace(new RegExp('"', 'g'), '""')}"`; - }); - csvRows.push(values.join(',')); - } - const csvContent = csvRows.join('\n'); - const BOM = '\uFEFF'; // UTF-8 BOM - await fs.writeFile(filePath, BOM + csvContent, 'utf-8'); - console.log(`已生成 ${filePath} 文件。`); -} - -/** - * Generates an XLSX file from the given data. - * @param {Array} data - The data to convert to XLSX. - * @param {string} filePath - The path to save the XLSX file. - */ -async function generateXlsx(data, filePath) { - if (data.length === 0) { - console.log(`没有数据可生成 XLSX 文件 (${filePath})。`); - return; - } - const workbook = new ExcelJS.Workbook(); - const worksheet = workbook.addWorksheet('Data'); - - const headers = Object.keys(data[0]); - worksheet.columns = headers.map(key => ({ - header: key, - key: key, - width: key.includes('地址') || key.includes('链接') ? 40 : 20 - })); - worksheet.addRows(data); - - worksheet.getRow(1).eachCell(cell => { - cell.font = { bold: true }; - cell.fill = { - type: 'pattern', - pattern:'solid', - fgColor:{argb:'FFDDDDDD'} - }; - cell.alignment = { vertical: 'middle', horizontal: 'center' }; - }); - worksheet.autoFilter = { - from: 'A1', - to: { - row: 1, - column: headers.length - } - }; - await workbook.xlsx.writeFile(filePath); - console.log(`已生成 ${filePath} 文件。`); -} - - // 主函数 - 导出以便在根目录调用 export async function main() { try { @@ -233,11 +159,9 @@ export async function main() { const processedData = await processScrapedData(allData); - await fs.writeFile('data.json', JSON.stringify(processedData, null, 4), 'utf-8'); - console.log(`更新后的数据已保存至 data.json 文件。`); - - // await generateCsv(processedData, './data.csv'); - // await generateXlsx(processedData, './data.xlsx'); + const dataPath = path.join(path.dirname(new URL(import.meta.url).pathname), '..', 'data', 'preSaleLicense.json'); + await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8'); + console.log(`更新后的数据已保存至 data/preSaleLicense.json 文件。`); console.log('\n所有数据处理和文件生成任务已完成。'); diff --git a/Project/index.js b/scripts/getProject.js similarity index 90% rename from Project/index.js rename to scripts/getProject.js index dca5d20..9e5b24b 100644 --- a/Project/index.js +++ b/scripts/getProject.js @@ -1,4 +1,4 @@ -import axios from 'axios'; +import axios from './axios.js'; import * as cheerio from 'cheerio'; import fs from 'fs/promises'; import path from 'path'; @@ -27,10 +27,7 @@ async function getEncryptedUrl(relativeUrl, origin) { const path = urlParts[0]; const queryString = urlParts[1]; - const xmlPayload = ` - -${xmlEncode(queryString)} -`; + const xmlPayload = ` ${xmlEncode(queryString)} `; const encryptionUrl = new URL('/Common/Agents/ExeFunCommon.aspx', origin).href; @@ -38,7 +35,6 @@ async function getEncryptedUrl(relativeUrl, origin) { const response = await axios.post(encryptionUrl, xmlPayload, { headers: { 'Content-Type': 'application/xml', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', } }); const encryptedQuery = response.data; @@ -98,11 +94,7 @@ async function scrapeProjectDetails(project) { try { console.log(`正在抓取详情: ${project['项目名称']}`); - const response = await axios.get(project['项目链接'], { - headers: { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' - } - }); + const response = await axios.get(project['项目链接']); const $ = cheerio.load(response.data); // 抓取 housedetail 表格中的所有信息 @@ -177,11 +169,7 @@ async function scrapeWebsite() { console.log('开始抓取第一页数据...'); const origin = new URL(START_URL).origin; - let response = await axios.get(START_URL, { - headers: { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' - } - }); + let response = await axios.get(START_URL); let $ = cheerio.load(response.data); let firstPageData = await extractDataFromHtml($, origin); @@ -233,7 +221,6 @@ async function scrapeWebsite() { response = await axios.post(START_URL, postData, { headers: { 'Content-Type': 'application/x-www-form-urlencoded', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'Referer': START_URL, } }); @@ -282,8 +269,9 @@ export async function main() { const processedData = await processScrapedData(allData); // 保存为JSON文件 - await fs.writeFile(path.join(__dirname, 'data.json'), JSON.stringify(processedData, null, 4), 'utf-8'); - console.log('项目数据已保存至 data.json 文件。'); + const dataPath = path.join(__dirname, '..', 'data', 'project.json'); + await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8'); + console.log('项目数据已保存至 data/project.json 文件。'); console.log('\n所有数据抓取和处理任务已完成。'); diff --git a/index.js b/scripts/index.js similarity index 74% rename from index.js rename to scripts/index.js index 98bd48f..ae0e82e 100644 --- a/index.js +++ b/scripts/index.js @@ -1,7 +1,7 @@ import fs from 'fs/promises'; import path from 'path'; -import { main as projectMain } from './Project/index.js'; -import { main as licenseMain } from './PreSaleLicense/index.js'; +import { main as getProject } from './getProject.js'; +import { main as getPreSaleLicense } from './getPreSaleLicense.js'; // 检查数据文件是否存在 async function checkDataFileExists(dataPath) { @@ -14,7 +14,7 @@ async function checkDataFileExists(dataPath) { } // 执行脚本函数并等待完成 -async function runScript(scriptFunction, description, cwd, dataFilePath) { +async function runScript(scriptFunction, description, dataFilePath) { console.log(`开始执行: ${description}`); // 检查数据文件是否已存在 @@ -25,19 +25,8 @@ async function runScript(scriptFunction, description, cwd, dataFilePath) { } try { - // 临时改变工作目录 - const originalCwd = process.cwd(); - if (cwd) { - process.chdir(cwd); - } - await scriptFunction(); - // 恢复原始工作目录 - if (cwd) { - process.chdir(originalCwd); - } - console.log(`✅ ${description} 执行完成\n`); return true; } catch (error) { @@ -52,11 +41,11 @@ async function mergeLicenseData() { try { // 读取项目数据 - const projectDataPath = path.join(process.cwd(), 'Project', 'data.json'); + const projectDataPath = path.join(process.cwd(), 'data', 'project.json'); const projectData = JSON.parse(await fs.readFile(projectDataPath, 'utf-8')); // 读取许可证数据 - const licenseDataPath = path.join(process.cwd(), 'PreSaleLicense', 'data.json'); + const licenseDataPath = path.join(process.cwd(), 'data', 'preSaleLicense.json'); const licenseData = JSON.parse(await fs.readFile(licenseDataPath, 'utf-8')); // 创建许可证号到许可证数据的映射 @@ -93,7 +82,7 @@ async function mergeLicenseData() { console.log(`✅ 成功合并 ${mergedCount} 个许可证详情`); // 保存合并后的数据 - const outputPath = path.join(process.cwd(), 'merged_data.json'); + const outputPath = path.join(process.cwd(), 'data', 'merged_data.json'); await fs.writeFile(outputPath, JSON.stringify(projectData, null, 4), 'utf-8'); console.log(`📁 合并后的数据已保存至: ${outputPath}`); @@ -113,17 +102,15 @@ async function main() { try { // 步骤1: 获取项目数据 - const projectCwd = path.join(process.cwd(), 'Project'); - const projectDataPath = path.join(projectCwd, 'data.json'); - const projectSuccess = await runScript(projectMain, '项目信息抓取脚本', projectCwd, projectDataPath); + const projectDataPath = path.join(process.cwd(), 'data', 'project.json'); + const projectSuccess = await runScript(getProject, '项目信息抓取脚本', projectDataPath); if (!projectSuccess) { throw new Error('项目信息抓取失败'); } // 步骤2: 获取许可证数据 - const licenseCwd = path.join(process.cwd(), 'PreSaleLicense'); - const licenseDataPath = path.join(licenseCwd, 'data.json'); - const licenseSuccess = await runScript(licenseMain, '预售许可证抓取脚本', licenseCwd, licenseDataPath); + const licenseDataPath = path.join(process.cwd(), 'data', 'preSaleLicense.json'); + const licenseSuccess = await runScript(getPreSaleLicense, '预售许可证抓取脚本', licenseDataPath); if (!licenseSuccess) { throw new Error('预售许可证抓取失败'); } @@ -132,7 +119,7 @@ async function main() { await mergeLicenseData(); console.log('\n🎉 所有数据处理完成!'); - console.log('📁 输出文件: merged_data.json'); + console.log('📁 输出文件: data/merged_data.json'); } catch (error) { console.error('\n💥 处理过程中发生错误:', error.message);