getEncryptedUrl

This commit is contained in:
秦秋旭 2026-01-22 14:40:42 +08:00
parent 2b5a602b49
commit 8ff37476f7
6 changed files with 578 additions and 565 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -347,7 +347,7 @@
"楼幢": [ "楼幢": [
{ {
"楼幢名称": "2幢", "楼幢名称": "2幢",
"成交均价": "6408.55", "成交均价": "6356.11",
"bid": "1099666" "bid": "1099666"
}, },
{ {
@ -582,7 +582,7 @@
}, },
{ {
"楼幢名称": "4幢", "楼幢名称": "4幢",
"成交均价": "8522.14", "成交均价": "8515.79",
"bid": "1099587" "bid": "1099587"
} }
] ]
@ -1942,7 +1942,7 @@
"楼幢": [ "楼幢": [
{ {
"楼幢名称": "4幢", "楼幢名称": "4幢",
"成交均价": "6328.95", "成交均价": "6328.52",
"bid": "1099714" "bid": "1099714"
} }
] ]
@ -3097,7 +3097,7 @@
"楼幢": [ "楼幢": [
{ {
"楼幢名称": "1幢", "楼幢名称": "1幢",
"成交均价": "7517.44", "成交均价": "7513.09",
"bid": "1099429" "bid": "1099429"
}, },
{ {
@ -3274,7 +3274,7 @@
"楼幢": [ "楼幢": [
{ {
"楼幢名称": "7幢", "楼幢名称": "7幢",
"成交均价": "7070.37", "成交均价": "7074.62",
"bid": "1099386" "bid": "1099386"
} }
] ]

View File

@ -0,0 +1,47 @@
import axios from './axios.js';
/**
 * Escapes a string so it can be embedded as XML character data.
 *
 * Only `&` and `<` are rewritten (to `&amp;` and `&lt;`); every other
 * character is passed through untouched.
 *
 * @param {string} text - Raw text to escape.
 * @returns {string} The escaped text, or '' when `text` is not a string.
 */
function xmlEncode(text) {
    if (typeof text !== 'string') {
        return '';
    }
    // Single pass: each match is either '&' or '<', so map it to its entity.
    return text.replace(/[&<]/g, (ch) => (ch === '&' ? '&amp;' : '&lt;'));
}
/**
 * Takes a relative URL from the website and returns a fully formed URL whose
 * query string has been encrypted by the site's encryption endpoint.
 *
 * The query string (everything after the first `?`) is XML-escaped, POSTed to
 * `/Common/Agents/ExeFunCommon.aspx` on `origin`, and the response body is
 * URL-encoded and used as the new query string. When the request fails or the
 * response is not a usable token, the unencrypted URL resolved against
 * `origin` is returned instead.
 *
 * @param {string} relativeUrl - The relative URL like 'ProjectDetailsInfo.aspx?a=1&b=2'.
 * @param {string} origin - The origin of the website, e.g., 'http://120.236.48.169:89'.
 * @returns {Promise<string>} - The full, encrypted URL ('' when `relativeUrl` is empty).
 */
export async function getEncryptedUrl(relativeUrl, origin) {
    if (!relativeUrl) {
        return '';
    }

    // Split on the FIRST '?' only. A literal '?' is legal inside a query
    // string, and everything after the first one must reach the encryptor.
    const queryStart = relativeUrl.indexOf('?');
    if (queryStart === -1) {
        return new URL(relativeUrl, origin).href;
    }
    const pathname = relativeUrl.slice(0, queryStart);
    const queryString = relativeUrl.slice(queryStart + 1);

    const xmlPayload = `<?xml version="1.0" encoding="utf-8" standalone="yes"?> <param funname="SouthDigital.BasicFun.MyEncrypt.DefaultEncrypt"> <item>${xmlEncode(queryString)}</item> </param>`;
    const encryptionUrl = new URL('/Common/Agents/ExeFunCommon.aspx', origin).href;

    try {
        const response = await axios.post(encryptionUrl, xmlPayload, {
            headers: {
                'Content-Type': 'application/xml',
            },
        });
        // Only a non-blank string (or number) is a usable token. Anything else,
        // e.g. a parsed object, would be serialized into a garbage query.
        const { data } = response;
        const isScalar = typeof data === 'string' || typeof data === 'number';
        const encryptedQuery = isScalar ? String(data).trim() : '';
        if (encryptedQuery) {
            return `${new URL(pathname, origin).href}?${encodeURIComponent(encryptedQuery)}`;
        }
    } catch (error) {
        console.error(`加密链接失败: ${relativeUrl}`, error.message);
    }
    // Fallback to original url on error
    return new URL(relativeUrl, origin).href;
}

View File

@ -2,20 +2,20 @@ import axios from './axios.js';
import * as cheerio from 'cheerio'; import * as cheerio from 'cheerio';
import fs from 'fs/promises'; import fs from 'fs/promises';
import path from 'path'; import path from 'path';
import { getEncryptedUrl } from './getEncryptedUrl.js';
const BASE_URL = 'http://120.236.48.169:89/HPMS/'; const BASE_URL = 'http://120.236.48.169:89/HPMS/';
const START_URL = BASE_URL + 'presellCertList.aspx'; const START_URL = BASE_URL + 'presellCertList.aspx';
// Extracts table data from a given HTML content // Extracts table data from a given HTML content
function extractDataFromHtml($) { async function extractDataFromHtml($, origin) {
const data = []; const dataPromises = $('.resultlist table tr:has(td)').get().map(async (row) => {
// Corrected selector to find rows with table data, skipping the header
const rows = $('.resultlist table tr:has(td)');
rows.each((i, row) => {
const columns = $(row).find('td'); const columns = $(row).find('td');
// Based on debug.html, the structure is different and has 9 columns // Based on debug.html, the structure is different and has 9 columns
if (columns.length >= 9) { if (columns.length < 9) {
return null;
}
const licenseCell = $(columns[1]); // 许可证号 is the 2nd column const licenseCell = $(columns[1]); // 许可证号 is the 2nd column
const licenseLinkTag = licenseCell.find('a'); const licenseLinkTag = licenseCell.find('a');
@ -38,16 +38,14 @@ function extractDataFromHtml($) {
// Make regex flexible to handle single or double quotes // Make regex flexible to handle single or double quotes
const match = onclickAttr.match(/open_click\(['"]([^'"]+)['"]\)/); const match = onclickAttr.match(/open_click\(['"]([^'"]+)['"]\)/);
if (match && match[1]) { if (match && match[1]) {
// match[1] will be like '/HPMS/PresellDetailsInfo.aspx?id=1012110' rowData['许可证链接'] = await getEncryptedUrl(match[1], origin);
// We need to resolve it against the origin, not the full BASE_URL path
const origin = new URL(BASE_URL).origin;
rowData['许可证链接'] = new URL(match[1], origin).href;
} }
} }
data.push(rowData); return rowData;
}
}); });
return data;
const data = await Promise.all(dataPromises);
return data.filter(item => item !== null);
} }
/** /**
@ -56,11 +54,13 @@ function extractDataFromHtml($) {
*/ */
async function scrapeWebsite() { async function scrapeWebsite() {
let allData = []; let allData = [];
const origin = new URL(BASE_URL).origin;
console.log('开始抓取第一页数据...'); console.log('开始抓取第一页数据...');
let response = await axios.get(START_URL); let response = await axios.get(START_URL);
let $ = cheerio.load(response.data); let $ = cheerio.load(response.data);
allData = extractDataFromHtml($); allData = await extractDataFromHtml($, origin);
console.log(`第一页抓取完成,获得 ${allData.length} 条数据。`); console.log(`第一页抓取完成,获得 ${allData.length} 条数据。`);
const pageCountSpan = $('#PageNavigator1_LblPageCount'); const pageCountSpan = $('#PageNavigator1_LblPageCount');
@ -109,7 +109,7 @@ async function scrapeWebsite() {
}); });
$ = cheerio.load(response.data); $ = cheerio.load(response.data);
const nextPageData = extractDataFromHtml($); const nextPageData = await extractDataFromHtml($, origin);
console.log(`${i} 页抓取完成,获得 ${nextPageData.length} 条数据。`); console.log(`${i} 页抓取完成,获得 ${nextPageData.length} 条数据。`);
if (nextPageData.length === 0) { if (nextPageData.length === 0) {
@ -160,15 +160,17 @@ export async function main() {
const processedData = await processScrapedData(allData); const processedData = await processScrapedData(allData);
const dataPath = path.join(path.dirname(new URL(import.meta.url).pathname), '..', 'data', 'preSaleLicense.json'); const dataPath = path.join(path.dirname(new URL(import.meta.url).pathname), '..', 'data', 'preSaleLicense.json');
await fs.mkdir(path.dirname(dataPath), { recursive: true });
await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8'); await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8');
console.log(`更新后的数据已保存至 data/preSaleLicense.json 文件。`); console.log(`更新后的数据已保存至 data/preSaleLicense.json 文件。`);
console.log('\n所有数据处理和文件生成任务已完成。'); console.log('\n所有数据处理和文件生成任务已完成。');
} catch (error) { } catch (error) {
console.error('抓取或处理过程中发生错误:', error.message); console.error('在 getPreSaleLicense.js 抓取或处理过程中发生错误:', error.message);
if (error.response) { if (error.response) {
console.error('Status:', error.response.status); console.error('Status:', error.response.status);
} }
throw error; // Re-throw the error to be caught by the caller
} }
} }

View File

@ -4,52 +4,14 @@ import fs from 'fs/promises';
import path from 'path'; import path from 'path';
import { fileURLToPath } from 'url'; import { fileURLToPath } from 'url';
import { getEncryptedUrl } from './getEncryptedUrl.js';
const __filename = fileURLToPath(import.meta.url); const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename); const __dirname = path.dirname(__filename);
const BASE_URL = 'http://120.236.48.169:89/HPMS/'; const BASE_URL = 'http://120.236.48.169:89/HPMS/';
const START_URL = BASE_URL + 'ProjectInfoList.aspx'; const START_URL = BASE_URL + 'ProjectInfoList.aspx';
/**
 * Escapes a string so it can be embedded as XML character data by replacing
 * `&` with `&amp;` and `<` with `&lt;`.
 *
 * @param {string} text - Raw text to escape.
 * @returns {string} The escaped text, or '' when `text` is not a string.
 */
function xmlEncode(text) {
    // Guard against undefined/null/non-string input instead of throwing a
    // TypeError on `.replace`.
    if (typeof text !== 'string') {
        return '';
    }
    return text.replace(/&/g, '&amp;').replace(/</g, '&lt;');
}
/**
 * Resolves a relative site URL against `origin` and replaces its query string
 * with the encrypted form returned by the site's encryption endpoint.
 *
 * The query string (everything after the first `?`) is XML-escaped, POSTed to
 * `/Common/Agents/ExeFunCommon.aspx` on `origin`, and the response body is
 * URL-encoded and used as the new query string. When the request fails or the
 * response is not a usable token, the unencrypted URL resolved against
 * `origin` is returned instead.
 *
 * @param {string} relativeUrl - Relative URL, optionally with a query string.
 * @param {string} origin - Origin used to resolve the URL and reach the endpoint.
 * @returns {Promise<string>} The full URL ('' when `relativeUrl` is empty).
 */
async function getEncryptedUrl(relativeUrl, origin) {
    if (!relativeUrl) {
        return '';
    }

    // Split on the FIRST '?' only. A literal '?' is legal inside a query
    // string, and everything after the first one must reach the encryptor.
    const queryStart = relativeUrl.indexOf('?');
    if (queryStart === -1) {
        return new URL(relativeUrl, origin).href;
    }
    // Named `pathname` so it does not shadow a module-level `path` import.
    const pathname = relativeUrl.slice(0, queryStart);
    const queryString = relativeUrl.slice(queryStart + 1);

    const xmlPayload = `<?xml version="1.0" encoding="utf-8" standalone="yes"?> <param funname="SouthDigital.BasicFun.MyEncrypt.DefaultEncrypt"> <item>${xmlEncode(queryString)}</item> </param>`;
    const encryptionUrl = new URL('/Common/Agents/ExeFunCommon.aspx', origin).href;

    try {
        const response = await axios.post(encryptionUrl, xmlPayload, {
            headers: {
                'Content-Type': 'application/xml',
            },
        });
        // Only a non-blank string (or number) is a usable token. Anything else,
        // e.g. a parsed object, would be serialized into a garbage query.
        const { data } = response;
        const isScalar = typeof data === 'string' || typeof data === 'number';
        const encryptedQuery = isScalar ? String(data).trim() : '';
        if (encryptedQuery) {
            return `${new URL(pathname, origin).href}?${encodeURIComponent(encryptedQuery)}`;
        }
    } catch (error) {
        console.error(`加密链接失败: ${relativeUrl}`, error.message);
    }
    // Fallback to original url on error
    return new URL(relativeUrl, origin).href;
}
// 从HTML中提取表格数据的函数 // 从HTML中提取表格数据的函数
async function extractDataFromHtml($, origin) { async function extractDataFromHtml($, origin) {
const rows = $('.resultlist table tr:has(td)').get(); const rows = $('.resultlist table tr:has(td)').get();
@ -270,15 +232,17 @@ export async function main() {
// 保存为JSON文件 // 保存为JSON文件
const dataPath = path.join(__dirname, '..', 'data', 'project.json'); const dataPath = path.join(__dirname, '..', 'data', 'project.json');
await fs.mkdir(path.dirname(dataPath), { recursive: true });
await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8'); await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8');
console.log('项目数据已保存至 data/project.json 文件。'); console.log('项目数据已保存至 data/project.json 文件。');
console.log('\n所有数据抓取和处理任务已完成。'); console.log('\n所有数据抓取和处理任务已完成。');
} catch (error) { } catch (error) {
console.error('抓取或处理过程中发生错误:', error.message); console.error('在 getProject.js 抓取或处理过程中发生错误:', error.message);
if (error.response) { if (error.response) {
console.error('Status:', error.response.status); console.error('Status:', error.response.status);
} }
throw error; // Re-throw the error to be caught by the caller
} }
} }