From f11b997db411912922749b6b905a65cf09b01264 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=A7=A6=E7=A7=8B=E6=97=AD?= Date: Thu, 22 Jan 2026 15:44:57 +0800 Subject: [PATCH] scraperUtils --- data/merged_data.json | 40 ++++++------ data/preSaleLicense.json | 10 +-- data/project.json | 10 +-- scripts/getCompanies.js | 111 +++------------------------------ scripts/getPreSaleLicense.js | 96 +++-------------------------- scripts/getProject.js | 116 ++++------------------------------- scripts/scraperUtils.js | 103 +++++++++++++++++++++++++++++++ 7 files changed, 162 insertions(+), 324 deletions(-) create mode 100644 scripts/scraperUtils.js diff --git a/data/merged_data.json b/data/merged_data.json index 0954cd9..564dd11 100644 --- a/data/merged_data.json +++ b/data/merged_data.json @@ -174,8 +174,8 @@ "资质等级": "二级", "核准预售套数": 236, "核准预售面积": "0", - "已售总套数": 69, - "未售总套数": 167, + "已售总套数": 70, + "未售总套数": 166, "已售总面积": "0", "未售总面积": "0", "楼盘销售部地址": "星河明珠湾营销中心", @@ -337,7 +337,7 @@ "楼幢": [ { "楼幢名称": "C2幢", - "成交均价": "14476.55", + "成交均价": "13967.14", "bid": "1099728" }, { @@ -357,7 +357,7 @@ "批准时间": "2025-12-05", "所在区域": "普宁市", "总套数": 109, - "可售套数": 90, + "可售套数": 89, "许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?d0BkXtgEYDqVcRqLE4C4aQ%3D%3D" } ] @@ -895,8 +895,8 @@ "资质等级": "二级", "核准预售套数": 1186, "核准预售面积": "0", - "已售总套数": 844, - "未售总套数": 342, + "已售总套数": 845, + "未售总套数": 341, "已售总面积": "0", "未售总面积": "0", "楼盘销售部地址": "星河明珠湾花园", @@ -919,7 +919,7 @@ }, { "楼幢名称": "B7幢", - "成交均价": "8468.21", + "成交均价": "8464.22", "bid": "1099699" } ], @@ -930,7 +930,7 @@ "批准时间": "2026-01-21", "所在区域": "普宁市", "总套数": 114, - "可售套数": 56, + "可售套数": 55, "许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?atcXVrdhLqFDsVq8ylD0hw%3D%3D" }, { @@ -2123,8 +2123,8 @@ "资质等级": "二级", "核准预售套数": 2114, "核准预售面积": "0", - "已售总套数": 1817, - "未售总套数": 297, + "已售总套数": 1818, + "未售总套数": 296, "已售总面积": "0", "未售总面积": "0", "楼盘销售部地址": "", @@ -2450,7 +2450,7 @@ }, { "楼幢名称": "14幢", - "成交均价": "6356.32", + "成交均价": "6512.01", "bid": "1099566" } ], @@ -2461,7 +2461,7 @@ "批准时间": "2025-12-02", "所在区域": "普宁市", "总套数": 301, - "可售套数": 176, + "可售套数": 175, "许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?d0BkXtgEYDqVhs6gdwhuMw%3D%3D" } ] @@ -4197,8 +4197,8 @@ "资质等级": "二级", "核准预售套数": 1118, "核准预售面积": "0", - "已售总套数": 992, - "未售总套数": 126, + "已售总套数": 994, + "未售总套数": 124, "已售总面积": "0", "未售总面积": "0", "楼盘销售部地址": "普宁市北二环大道与铁山兰路交汇处", @@ -4302,7 +4302,7 @@ "楼幢": [ { "楼幢名称": "1幢", - "成交均价": "7266.38", + "成交均价": "7270.26", "bid": "1099705" } ], @@ -4313,7 +4313,7 @@ "批准时间": "2025-10-24", "所在区域": "普宁市", "总套数": 160, - "可售套数": 101, + "可售套数": 99, "许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?d0BkXtgEYDq%2FwD%2FQObW7NQ%3D%3D" }, { @@ -10812,8 +10812,8 @@ "资质等级": "暂定资质", "核准预售套数": 398, "核准预售面积": "0", - "已售总套数": 407, - "未售总套数": -9, + "已售总套数": 405, + "未售总套数": -7, "已售总面积": "0", "未售总面积": "0", "楼盘销售部地址": "", @@ -10826,7 +10826,7 @@ "楼幢": [ { "楼幢名称": "1号楼", - "成交均价": "6168.59", + "成交均价": "6176.77", "bid": "-664" }, { @@ -10847,7 +10847,7 @@ "批准时间": "2020-08-20", "所在区域": "普宁市", "总套数": 398, - "可售套数": -9, + "可售套数": -7, "许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?%2FJnZZ%2FURSYFPzGDwdoVynw%3D%3D" } ] diff --git a/data/preSaleLicense.json b/data/preSaleLicense.json index 7c55b91..784405b 100644 --- a/data/preSaleLicense.json +++ b/data/preSaleLicense.json @@ -32,7 +32,7 @@ "批准时间": "2026-01-21", "所在区域": "普宁市", "总套数": 114, - "可售套数": 56, + "可售套数": 55, "许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?atcXVrdhLqFDsVq8ylD0hw%3D%3D" }, { @@ -152,7 +152,7 @@ "批准时间": "2025-12-05", "所在区域": "普宁市", "总套数": 109, - "可售套数": 90, + "可售套数": 89, "许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?d0BkXtgEYDqVcRqLE4C4aQ%3D%3D" }, { @@ -212,7 +212,7 @@ "批准时间": "2025-12-02", "所在区域": "普宁市", "总套数": 301, - "可售套数": 176, + "可售套数": 175, "许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?d0BkXtgEYDqVhs6gdwhuMw%3D%3D" }, { @@ -284,7 +284,7 @@ "批准时间": "2025-10-24", "所在区域": "普宁市", "总套数": 160, - "可售套数": 101, + "可售套数": 99, "许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?d0BkXtgEYDq%2FwD%2FQObW7NQ%3D%3D" }, { @@ -1628,7 +1628,7 @@ "批准时间": "2020-08-20", "所在区域": "普宁市", "总套数": 398, - "可售套数": -9, + "可售套数": -7, "许可证链接": "http://120.236.48.169:89/HPMS/PresellDetailsInfo.aspx?%2FJnZZ%2FURSYFPzGDwdoVynw%3D%3D" }, { diff --git a/data/project.json b/data/project.json index 73302e4..2176d07 100644 --- a/data/project.json +++ b/data/project.json @@ -247,7 +247,7 @@ "楼幢": [ { "楼幢名称": "C2幢", - "成交均价": "14476.55", + "成交均价": "13967.14", "bid": "1099728" }, { @@ -730,7 +730,7 @@ }, { "楼幢名称": "B7幢", - "成交均价": "8468.21", + "成交均价": "8464.22", "bid": "1099699" } ] @@ -1892,7 +1892,7 @@ }, { "楼幢名称": "14幢", - "成交均价": "6356.32", + "成交均价": "6512.01", "bid": "1099566" } ] @@ -3294,7 +3294,7 @@ "楼幢": [ { "楼幢名称": "1幢", - "成交均价": "7266.38", + "成交均价": "7270.26", "bid": "1099705" } ] @@ -8837,7 +8837,7 @@ "楼幢": [ { "楼幢名称": "1号楼", - "成交均价": "6168.59", + "成交均价": "6176.77", "bid": "-664" }, { diff --git a/scripts/getCompanies.js b/scripts/getCompanies.js index 91bdf08..f252176 100644 --- a/scripts/getCompanies.js +++ b/scripts/getCompanies.js @@ -2,11 +2,12 @@ import axios from './axios.js'; import * as cheerio from 'cheerio'; import fs from 'fs/promises'; import path from 'path'; -import { fileURLToPath } from 'url'; // Add fileURLToPath import -import { getEncryptedUrl } from './getEncryptedUrl.js'; // Import shared encryption function +import { fileURLToPath } from 'url'; +import { getEncryptedUrl } from './getEncryptedUrl.js'; +import { scrapePaginatedData, reIndex } from './scraperUtils.js'; // Import new utils -const __filename = fileURLToPath(import.meta.url); // Define __filename -const __dirname = path.dirname(__filename); // Define __dirname +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); const BASE_URL = 'http://120.236.48.169:89/HEMS/'; const START_URL = BASE_URL + 'CompanyList.aspx'; @@ -52,111 +53,13 @@ async function extractDataFromHtml($, origin) { return data.filter(item => item !== null); } -/** - * Scrapes all company data from the target website, handling pagination. - * @returns {Promise>} - An array of all scraped data records. - */ -async function scrapeWebsite() { - let allData = []; - const origin = new URL(START_URL).origin; - - console.log('开始抓取公司列表第一页数据...'); - let response = await axios.get(START_URL); - - let $ = cheerio.load(response.data); - let firstPageData = await extractDataFromHtml($, origin); - allData = allData.concat(firstPageData); - console.log(`第一页抓取完成,获得 ${firstPageData.length} 条数据。`); - - // Get total pages for pagination - const totalRecordsSpan = $('#PageNavigator1_LblRecordCount'); - const totalPagesSpan = $('#PageNavigator1_LblPageCount'); - - const totalRecords = totalRecordsSpan.length ? parseInt(totalRecordsSpan.text(), 10) : 0; - const totalPages = totalPagesSpan.length ? parseInt(totalPagesSpan.text(), 10) : 1; - console.log(`共发现 ${totalRecords} 条记录,分为 ${totalPages} 页。`); - - // Collect form data for POST requests - const formValues = {}; - $('input[name^="txt"], select').each((idx, el) => { - const name = $(el).attr('name'); - if (name) { - formValues[name] = $(el).val() || ''; - } - }); - - // Scrape remaining pages - for (let i = 2; i <= totalPages; i++) { - console.log(`正在抓取第 ${i} 页...`); - - const viewState = $('#__VIEWSTATE').val(); - const viewStateGenerator = $('#__VIEWSTATEGENERATOR').val(); - const eventValidation = $('#__EVENTVALIDATION').val(); - - if (!viewState) { - console.log('无法找到 __VIEWSTATE,终止抓取。'); - break; - } - - const postData = new URLSearchParams(); - postData.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext'); - postData.append('__EVENTARGUMENT', ''); - postData.append('__VIEWSTATE', viewState); - postData.append('__VIEWSTATEGENERATOR', viewStateGenerator); - if (eventValidation) { - postData.append('__EVENTVALIDATION', eventValidation); - } - - // Add form fields - for (const name in formValues) { - postData.append(name, formValues[name]); - } - postData.append('PageNavigator1$txtNewPageIndex', (i - 1).toString()); - - response = await axios.post(START_URL, postData, { - headers: { - 'Content-Type': 'application/x-www-form-urlencoded', - 'Referer': START_URL, - } - }); - - $ = cheerio.load(response.data); - const nextPageData = await extractDataFromHtml($, origin); - console.log(`第 ${i} 页抓取完成,获得 ${nextPageData.length} 条数据。`); - - if (nextPageData.length === 0) { - console.log(`第 ${i} 页没有数据,抓取结束。`); - break; - } - allData = allData.concat(nextPageData); - } - - console.log(`\n抓取全站公司数据完毕!共 ${allData.length} 条原始记录。`); - return allData; -} - -/** - * Performs post-processing on the scraped data: - * 1. Re-indexes "序号" sequentially. - * @param {Array} allData - The raw scraped data. - * @returns {Array} - The processed data. - */ -function processScrapedData(allData) { - // Re-index "序号" sequentially - for (let i = 0; i < allData.length; i++) { - allData[i]['序号'] = (i + 1).toString(); - } - console.log('序号字段已重新编号。'); - - return allData; -} // Main function to orchestrate the scraping and saving process export async function main() { try { console.log('🚀 开始抓取普宁房地产开发企业列表...'); - const allData = await scrapeWebsite(); - const processedData = processScrapedData(allData); + const allData = await scrapePaginatedData(START_URL, extractDataFromHtml); // Use generic pagination + const processedData = reIndex(allData); // Use generic re-indexing const dataPath = path.join(__dirname, '..', 'data', 'companies.json'); await fs.mkdir(path.dirname(dataPath), { recursive: true }); // Ensure data directory exists diff --git a/scripts/getPreSaleLicense.js b/scripts/getPreSaleLicense.js index 118dcc1..649899d 100644 --- a/scripts/getPreSaleLicense.js +++ b/scripts/getPreSaleLicense.js @@ -3,6 +3,7 @@ import * as cheerio from 'cheerio'; import fs from 'fs/promises'; import path from 'path'; import { getEncryptedUrl } from './getEncryptedUrl.js'; +import { scrapePaginatedData, reIndex } from './scraperUtils.js'; // Import new utils const BASE_URL = 'http://120.236.48.169:89/HPMS/'; const START_URL = BASE_URL + 'presellCertList.aspx'; @@ -48,99 +49,21 @@ async function extractDataFromHtml($, origin) { return data.filter(item => item !== null); } -/** - * Scrapes all data from the target website, handling pagination. - * @returns {Promise>} - An array of all scraped data records. - */ -async function scrapeWebsite() { - let allData = []; - const origin = new URL(BASE_URL).origin; - - console.log('开始抓取第一页数据...'); - let response = await axios.get(START_URL); - - let $ = cheerio.load(response.data); - allData = await extractDataFromHtml($, origin); - console.log(`第一页抓取完成,获得 ${allData.length} 条数据。`); - - const pageCountSpan = $('#PageNavigator1_LblPageCount'); - const totalPages = pageCountSpan.length ? parseInt(pageCountSpan.text(), 10) : 1; - console.log(`共发现 ${totalPages} 页。`); - - const formValues = {}; - $('input[name^="txt"], select').each((idx, el) => { - const name = $(el).attr('name'); - if (name) { - formValues[name] = $(el).val() || ''; - } - }); - - for (let i = 2; i <= totalPages; i++) { - console.log(`正在抓取第 ${i} 页...`); - - const viewState = $('#__VIEWSTATE').val(); - const viewStateGenerator = $('#__VIEWSTATEGENERATOR').val(); - const eventValidation = $('#__EVENTVALIDATION').val(); - - if (!viewState) { - console.log('无法找到 __VIEWSTATE,终止抓取。'); - break; - } - - const postData = new URLSearchParams(); - postData.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext'); - postData.append('__EVENTARGUMENT', ''); - postData.append('__VIEWSTATE', viewState); - postData.append('__VIEWSTATEGENERATOR', viewStateGenerator); - if(eventValidation) { - postData.append('__EVENTVALIDATION', eventValidation); - } - - for (const name in formValues) { - postData.append(name, formValues[name]); - } - postData.append('PageNavigator1$txtNewPageIndex', (i - 1).toString()); - - response = await axios.post(START_URL, postData, { - headers: { - 'Content-Type': 'application/x-www-form-urlencoded', - 'Referer': START_URL, - } - }); - - $ = cheerio.load(response.data); - const nextPageData = await extractDataFromHtml($, origin); - console.log(`第 ${i} 页抓取完成,获得 ${nextPageData.length} 条数据。`); - - if (nextPageData.length === 0) { - console.log(`第 ${i} 页没有数据,抓取结束。`); - break; - } - allData = allData.concat(nextPageData); - } - console.log(`\n抓取全站数据完毕!共 ${allData.length} 条原始记录。`); - return allData; -} - - /** * Performs post-processing on the scraped data: * 1. Filters out records where "许可证号" is "空". - * 2. Re-indexes "序号" sequentially. + * 2. Re-indexes "序号" sequentially using the utility function. * 3. Converts "总套数" and "可售套数" fields to numbers. * @param {Array} allData - The raw scraped data. * @returns {Promise>} - The processed data. */ async function processScrapedData(allData) { - // 1. Filter out records where "许可证号" is "空" - let processedData = allData.filter(record => record['许可证号'] !== '空'); - console.log(`删除 "许可证号" 为 "空" 的记录后,剩余 ${processedData.length} 条记录。`); + // 1. Filter out records where "许可证号" is "空" or null + let processedData = allData.filter(record => record['许可证号'] !== '空' && record['许可证号']); + console.log(`删除无效许可证记录后,剩余 ${processedData.length} 条记录。`); // 2. Re-index "序号" sequentially - for (let i = 0; i < processedData.length; i++) { - processedData[i]['序号'] = (i + 1).toString(); - } - console.log('序号字段已重新编号。'); + reIndex(processedData); // Use the shared utility function // 3. Convert "总套数" and "可售套数" to numbers for (const record of processedData) { @@ -155,14 +78,15 @@ async function processScrapedData(allData) { // 主函数 - 导出以便在根目录调用 export async function main() { try { - const allData = await scrapeWebsite(); + console.log('🚀 开始抓取预售许可证数据...'); + const allData = await scrapePaginatedData(START_URL, extractDataFromHtml); // Use generic pagination const processedData = await processScrapedData(allData); const dataPath = path.join(path.dirname(new URL(import.meta.url).pathname), '..', 'data', 'preSaleLicense.json'); await fs.mkdir(path.dirname(dataPath), { recursive: true }); await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8'); - console.log(`更新后的数据已保存至 data/preSaleLicense.json 文件。`); + console.log(`✅ 更新后的数据已保存至 data/preSaleLicense.json 文件。`); console.log('\n所有数据处理和文件生成任务已完成。'); @@ -173,4 +97,4 @@ export async function main() { } throw error; // Re-throw the error to be caught by the caller } -} +} \ No newline at end of file diff --git a/scripts/getProject.js b/scripts/getProject.js index 41580a8..af1bdf9 100644 --- a/scripts/getProject.js +++ b/scripts/getProject.js @@ -5,6 +5,7 @@ import path from 'path'; import { fileURLToPath } from 'url'; import { getEncryptedUrl } from './getEncryptedUrl.js'; +import { scrapePaginatedData, reIndex } from './scraperUtils.js'; // Import new utils const __filename = fileURLToPath(import.meta.url); const __dirname = path.dirname(__filename); @@ -122,113 +123,20 @@ async function scrapeProjectDetails(project) { } -/** - * 抓取网站所有数据,处理分页 - * @returns {Promise>} - 所有抓取的数据记录数组 - */ -async function scrapeWebsite() { - let allData = []; - console.log('开始抓取第一页数据...'); - const origin = new URL(START_URL).origin; - - let response = await axios.get(START_URL); - - let $ = cheerio.load(response.data); - let firstPageData = await extractDataFromHtml($, origin); - allData = allData.concat(firstPageData); - console.log(`第一页抓取完成,获得 ${allData.length} 条数据。`); - - // 获取总页数 - const pageCountSpan = $('#PageNavigator1_LblPageCount'); - const totalPages = pageCountSpan.length ? parseInt(pageCountSpan.text(), 10) : 1; - console.log(`共发现 ${totalPages} 页。`); - - // 收集表单数据,用于POST请求 - const formValues = {}; - $('input[name^="txt"], select').each((idx, el) => { - const name = $(el).attr('name'); - if (name) { - formValues[name] = $(el).val() || ''; - } - }); - - // 抓取其余页面 - for (let i = 2; i <= totalPages; i++) { - console.log(`正在抓取第 ${i} 页...`); - - const viewState = $('#__VIEWSTATE').val(); - const viewStateGenerator = $('#__VIEWSTATEGENERATOR').val(); - const eventValidation = $('#__EVENTVALIDATION').val(); - - if (!viewState) { - console.log('无法找到 __VIEWSTATE,终止抓取。'); - break; - } - - const postData = new URLSearchParams(); - postData.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext'); - postData.append('__EVENTARGUMENT', ''); - postData.append('__VIEWSTATE', viewState); - postData.append('__VIEWSTATEGENERATOR', viewStateGenerator); - if (eventValidation) { - postData.append('__EVENTVALIDATION', eventValidation); - } - - // 添加表单字段 - for (const name in formValues) { - postData.append(name, formValues[name]); - } - postData.append('PageNavigator1$txtNewPageIndex', (i - 1).toString()); - - response = await axios.post(START_URL, postData, { - headers: { - 'Content-Type': 'application/x-www-form-urlencoded', - 'Referer': START_URL, - } - }); - - $ = cheerio.load(response.data); - const nextPageData = await extractDataFromHtml($, origin); - console.log(`第 ${i} 页抓取完成,获得 ${nextPageData.length} 条数据。`); - - if (nextPageData.length === 0) { - console.log(`第 ${i} 页没有数据,抓取结束。`); - break; - } - allData = allData.concat(nextPageData); - } - - console.log(`\n抓取全站数据完毕!共 ${allData.length} 条原始记录。`); - - console.log('\n开始抓取项目详情...'); - const detailedDataPromises = allData.map(project => scrapeProjectDetails(project)); - const detailedData = await Promise.all(detailedDataPromises); - console.log('所有项目详情抓取完毕。'); - - return detailedData; -} - -/** - * 对抓取的数据进行后处理: - * 1. 重新编号序号字段 - * @param {Array} allData - 原始抓取数据 - * @returns {Promise>} - 处理后的数据 - */ -async function processScrapedData(allData) { - // 重新编号序号字段 - for (let i = 0; i < allData.length; i++) { - allData[i]['序号'] = (i + 1).toString(); - } - console.log('序号字段已重新编号。'); - - return allData; -} +// The old scrapeWebsite function is removed. // 主函数 - 导出以便在根目录调用 export async function main() { try { - const allData = await scrapeWebsite(); - const processedData = await processScrapedData(allData); + console.log('🚀 开始抓取项目数据...'); + const projectList = await scrapePaginatedData(START_URL, extractDataFromHtml); + + console.log('\n开始抓取项目详情...'); + const detailedDataPromises = projectList.map(project => scrapeProjectDetails(project)); + const allData = await Promise.all(detailedDataPromises); + console.log('所有项目详情抓取完毕。'); + + const processedData = reIndex(allData); // Use generic re-indexing // 保存为JSON文件 const dataPath = path.join(__dirname, '..', 'data', 'project.json'); @@ -245,4 +153,4 @@ export async function main() { } throw error; // Re-throw the error to be caught by the caller } -} \ No newline at end of file +} diff --git a/scripts/scraperUtils.js b/scripts/scraperUtils.js new file mode 100644 index 0000000..f85d927 --- /dev/null +++ b/scripts/scraperUtils.js @@ -0,0 +1,103 @@ +import axios from './axios.js'; +import * as cheerio from 'cheerio'; + +/** + * A generic function to scrape paginated data from the target ASP.NET website. + * @param {string} startUrl - The initial URL to begin scraping. + * @param {Function} extractDataFromHtml - The page-specific function to extract data from Cheerio object. + * @returns {Promise>} - An array of all scraped data records. + */ +export async function scrapePaginatedData(startUrl, extractDataFromHtml) { + let allData = []; + const origin = new URL(startUrl).origin; + + console.log(`开始抓取第一页数据: ${startUrl}`); + let response = await axios.get(startUrl); + let $ = cheerio.load(response.data); + + let firstPageData = await extractDataFromHtml($, origin); + allData = allData.concat(firstPageData); + console.log(`第一页抓取完成,获得 ${firstPageData.length} 条数据。`); + + // Get total pages for pagination + const totalPagesSpan = $('#PageNavigator1_LblPageCount'); + const totalPages = totalPagesSpan.length ? parseInt(totalPagesSpan.text(), 10) : 1; + console.log(`共发现 ${totalPages} 页。`); + + if (totalPages <= 1) { + console.log('\n抓取全站数据完毕!'); + return allData; + } + + // Collect form data for POST requests + const formValues = {}; + $('input[name^="txt"], select').each((idx, el) => { + const name = $(el).attr('name'); + if (name) { + formValues[name] = $(el).val() || ''; + } + }); + + // Scrape remaining pages + for (let i = 2; i <= totalPages; i++) { + console.log(`正在抓取第 ${i} 页...`); + + const viewState = $('#__VIEWSTATE').val(); + const viewStateGenerator = $('#__VIEWSTATEGENERATOR').val(); + const eventValidation = $('#__EVENTVALIDATION').val(); + + if (!viewState) { + console.log('无法找到 __VIEWSTATE,终止抓取。'); + break; + } + + const postData = new URLSearchParams(); + postData.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext'); + postData.append('__EVENTARGUMENT', ''); + postData.append('__VIEWSTATE', viewState); + postData.append('__VIEWSTATEGENERATOR', viewStateGenerator); + if (eventValidation) { + postData.append('__EVENTVALIDATION', eventValidation); + } + + for (const name in formValues) { + postData.append(name, formValues[name]); + } + postData.append('PageNavigator1$txtNewPageIndex', (i - 1).toString()); + + response = await axios.post(startUrl, postData, { + headers: { + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': startUrl, + } + }); + + $ = cheerio.load(response.data); + const nextPageData = await extractDataFromHtml($, origin); + console.log(`第 ${i} 页抓取完成,获得 ${nextPageData.length} 条数据。`); + + if (nextPageData.length === 0) { + console.log(`第 ${i} 页没有数据,抓取结束。`); + break; + } + allData = allData.concat(nextPageData); + } + + console.log(`\n抓取全站数据完毕!共 ${allData.length} 条原始记录。`); + return allData; +} + +/** + * Re-indexes the "序号" field of each record in an array sequentially, starting from 1. + * @param {Array} dataArray - The array of data to re-index. + * @returns {Array} - The re-indexed data array. + */ +export function reIndex(dataArray) { + if (!dataArray || dataArray.length === 0) return []; + + for (let i = 0; i < dataArray.length; i++) { + dataArray[i]['序号'] = (i + 1).toString(); + } + console.log('序号字段已重新编号。'); + return dataArray; +}