getEncryptedUrl
This commit is contained in:
parent
2b5a602b49
commit
8ff37476f7
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -347,7 +347,7 @@
|
||||
"楼幢": [
|
||||
{
|
||||
"楼幢名称": "2幢",
|
||||
"成交均价": "6408.55",
|
||||
"成交均价": "6356.11",
|
||||
"bid": "1099666"
|
||||
},
|
||||
{
|
||||
@ -582,7 +582,7 @@
|
||||
},
|
||||
{
|
||||
"楼幢名称": "4幢",
|
||||
"成交均价": "8522.14",
|
||||
"成交均价": "8515.79",
|
||||
"bid": "1099587"
|
||||
}
|
||||
]
|
||||
@ -1942,7 +1942,7 @@
|
||||
"楼幢": [
|
||||
{
|
||||
"楼幢名称": "4幢",
|
||||
"成交均价": "6328.95",
|
||||
"成交均价": "6328.52",
|
||||
"bid": "1099714"
|
||||
}
|
||||
]
|
||||
@ -3097,7 +3097,7 @@
|
||||
"楼幢": [
|
||||
{
|
||||
"楼幢名称": "1幢",
|
||||
"成交均价": "7517.44",
|
||||
"成交均价": "7513.09",
|
||||
"bid": "1099429"
|
||||
},
|
||||
{
|
||||
@ -3274,7 +3274,7 @@
|
||||
"楼幢": [
|
||||
{
|
||||
"楼幢名称": "7幢",
|
||||
"成交均价": "7070.37",
|
||||
"成交均价": "7074.62",
|
||||
"bid": "1099386"
|
||||
}
|
||||
]
|
||||
|
||||
47
scripts/getEncryptedUrl.js
Normal file
47
scripts/getEncryptedUrl.js
Normal file
@ -0,0 +1,47 @@
|
||||
import axios from './axios.js';
|
||||
|
||||
/**
 * Escapes the characters that would break an XML text node.
 *
 * `&` is replaced first so that the ampersands introduced by the `&lt;`
 * replacement are not escaped a second time.
 *
 * @param {string} text - Raw text to embed inside an XML element.
 * @returns {string} The escaped text, or '' when `text` is not a string.
 */
function xmlEncode(text) {
    if (typeof text !== 'string') return '';
    return text.replace(/&/g, '&amp;').replace(/</g, '&lt;');
}
|
||||
|
||||
/**
 * Takes a relative URL from the website and returns a fully formed, encrypted URL.
 *
 * The query string is wrapped in an XML payload and POSTed to
 * `/Common/Agents/ExeFunCommon.aspx` on `origin`; the response body then
 * replaces the original query string (URI-component encoded). When the URL
 * has no query string, the request fails, or the response body is empty,
 * the plain absolute URL is returned instead.
 *
 * @param {string} relativeUrl - The relative URL like 'ProjectDetailsInfo.aspx?a=1&b=2'.
 * @param {string} origin - The origin of the website, e.g., 'http://120.236.48.169:89'.
 * @returns {Promise<string>} - The full, encrypted URL ('' when `relativeUrl` is empty).
 */
export async function getEncryptedUrl(relativeUrl, origin) {
    if (!relativeUrl) {
        return '';
    }

    // Split on the FIRST '?' only: a query value may itself contain '?',
    // and everything after the first separator has to be encrypted.
    const queryStart = relativeUrl.indexOf('?');
    if (queryStart === -1) {
        return new URL(relativeUrl, origin).href;
    }

    const pagePath = relativeUrl.slice(0, queryStart);
    const queryString = relativeUrl.slice(queryStart + 1);

    const xmlPayload = `<?xml version="1.0" encoding="utf-8" standalone="yes"?> <param funname="SouthDigital.BasicFun.MyEncrypt.DefaultEncrypt"> <item>${xmlEncode(queryString)}</item> </param>`;

    const encryptionUrl = new URL('/Common/Agents/ExeFunCommon.aspx', origin).href;

    try {
        const response = await axios.post(encryptionUrl, xmlPayload, {
            headers: {
                'Content-Type': 'application/xml',
            },
        });
        const encryptedQuery = response.data;
        if (encryptedQuery) {
            return `${new URL(pagePath, origin).href}?${encodeURIComponent(encryptedQuery)}`;
        }
    } catch (error) {
        // Best effort: log and fall through to the unencrypted URL below.
        console.error(`加密链接失败: ${relativeUrl}`, error.message);
    }

    // Fallback to original url on error
    return new URL(relativeUrl, origin).href;
}
|
||||
@ -2,52 +2,50 @@ import axios from './axios.js';
|
||||
import * as cheerio from 'cheerio';
|
||||
import fs from 'fs/promises';
|
||||
import path from 'path';
|
||||
import { getEncryptedUrl } from './getEncryptedUrl.js';
|
||||
|
||||
const BASE_URL = 'http://120.236.48.169:89/HPMS/';
|
||||
const START_URL = BASE_URL + 'presellCertList.aspx';
|
||||
|
||||
// Extracts table data from a given HTML content
|
||||
function extractDataFromHtml($) {
|
||||
const data = [];
|
||||
// Corrected selector to find rows with table data, skipping the header
|
||||
const rows = $('.resultlist table tr:has(td)');
|
||||
|
||||
rows.each((i, row) => {
|
||||
async function extractDataFromHtml($, origin) {
|
||||
const dataPromises = $('.resultlist table tr:has(td)').get().map(async (row) => {
|
||||
const columns = $(row).find('td');
|
||||
// Based on debug.html, the structure is different and has 9 columns
|
||||
if (columns.length >= 9) {
|
||||
const licenseCell = $(columns[1]); // 许可证号 is the 2nd column
|
||||
const licenseLinkTag = licenseCell.find('a');
|
||||
|
||||
const rowData = {
|
||||
'序号': $(columns[0]).text().trim(),
|
||||
'许可证号': licenseLinkTag.text().trim(),
|
||||
'开发企业': $(columns[2]).text().trim(),
|
||||
'项目名称': $(columns[3]).text().trim(),
|
||||
'项目地址': $(columns[4]).text().trim(),
|
||||
'批准时间': $(columns[5]).text().trim(),
|
||||
'所在区域': $(columns[6]).text().trim(),
|
||||
'总套数': $(columns[7]).text().trim(),
|
||||
'可售套数': $(columns[8]).text().trim(),
|
||||
'许可证链接': '', // Initialize
|
||||
};
|
||||
|
||||
// The link is inside an onclick attribute, not a standard href
|
||||
const onclickAttr = licenseLinkTag.attr('onclick');
|
||||
if (onclickAttr) {
|
||||
// Make regex flexible to handle single or double quotes
|
||||
const match = onclickAttr.match(/open_click\(['"]([^'"]+)['"]\)/);
|
||||
if (match && match[1]) {
|
||||
// match[1] will be like '/HPMS/PresellDetailsInfo.aspx?id=1012110'
|
||||
// We need to resolve it against the origin, not the full BASE_URL path
|
||||
const origin = new URL(BASE_URL).origin;
|
||||
rowData['许可证链接'] = new URL(match[1], origin).href;
|
||||
}
|
||||
}
|
||||
data.push(rowData);
|
||||
if (columns.length < 9) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const licenseCell = $(columns[1]); // 许可证号 is the 2nd column
|
||||
const licenseLinkTag = licenseCell.find('a');
|
||||
|
||||
const rowData = {
|
||||
'序号': $(columns[0]).text().trim(),
|
||||
'许可证号': licenseLinkTag.text().trim(),
|
||||
'开发企业': $(columns[2]).text().trim(),
|
||||
'项目名称': $(columns[3]).text().trim(),
|
||||
'项目地址': $(columns[4]).text().trim(),
|
||||
'批准时间': $(columns[5]).text().trim(),
|
||||
'所在区域': $(columns[6]).text().trim(),
|
||||
'总套数': $(columns[7]).text().trim(),
|
||||
'可售套数': $(columns[8]).text().trim(),
|
||||
'许可证链接': '', // Initialize
|
||||
};
|
||||
|
||||
// The link is inside an onclick attribute, not a standard href
|
||||
const onclickAttr = licenseLinkTag.attr('onclick');
|
||||
if (onclickAttr) {
|
||||
// Make regex flexible to handle single or double quotes
|
||||
const match = onclickAttr.match(/open_click\(['"]([^'"]+)['"]\)/);
|
||||
if (match && match[1]) {
|
||||
rowData['许可证链接'] = await getEncryptedUrl(match[1], origin);
|
||||
}
|
||||
}
|
||||
return rowData;
|
||||
});
|
||||
return data;
|
||||
|
||||
const data = await Promise.all(dataPromises);
|
||||
return data.filter(item => item !== null);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -56,11 +54,13 @@ function extractDataFromHtml($) {
|
||||
*/
|
||||
async function scrapeWebsite() {
|
||||
let allData = [];
|
||||
const origin = new URL(BASE_URL).origin;
|
||||
|
||||
console.log('开始抓取第一页数据...');
|
||||
let response = await axios.get(START_URL);
|
||||
|
||||
let $ = cheerio.load(response.data);
|
||||
allData = extractDataFromHtml($);
|
||||
allData = await extractDataFromHtml($, origin);
|
||||
console.log(`第一页抓取完成,获得 ${allData.length} 条数据。`);
|
||||
|
||||
const pageCountSpan = $('#PageNavigator1_LblPageCount');
|
||||
@ -109,7 +109,7 @@ async function scrapeWebsite() {
|
||||
});
|
||||
|
||||
$ = cheerio.load(response.data);
|
||||
const nextPageData = extractDataFromHtml($);
|
||||
const nextPageData = await extractDataFromHtml($, origin);
|
||||
console.log(`第 ${i} 页抓取完成,获得 ${nextPageData.length} 条数据。`);
|
||||
|
||||
if (nextPageData.length === 0) {
|
||||
@ -160,15 +160,17 @@ export async function main() {
|
||||
const processedData = await processScrapedData(allData);
|
||||
|
||||
const dataPath = path.join(path.dirname(new URL(import.meta.url).pathname), '..', 'data', 'preSaleLicense.json');
|
||||
await fs.mkdir(path.dirname(dataPath), { recursive: true });
|
||||
await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8');
|
||||
console.log(`更新后的数据已保存至 data/preSaleLicense.json 文件。`);
|
||||
|
||||
console.log('\n所有数据处理和文件生成任务已完成。');
|
||||
|
||||
} catch (error) {
|
||||
console.error('抓取或处理过程中发生错误:', error.message);
|
||||
console.error('在 getPreSaleLicense.js 抓取或处理过程中发生错误:', error.message);
|
||||
if (error.response) {
|
||||
console.error('Status:', error.response.status);
|
||||
}
|
||||
throw error; // Re-throw the error to be caught by the caller
|
||||
}
|
||||
}
|
||||
|
||||
@ -4,52 +4,14 @@ import fs from 'fs/promises';
|
||||
import path from 'path';
|
||||
import { fileURLToPath } from 'url';
|
||||
|
||||
import { getEncryptedUrl } from './getEncryptedUrl.js';
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = path.dirname(__filename);
|
||||
|
||||
const BASE_URL = 'http://120.236.48.169:89/HPMS/';
|
||||
const START_URL = BASE_URL + 'ProjectInfoList.aspx';
|
||||
|
||||
/**
 * Escapes the characters that would break an XML text node.
 *
 * `&` is replaced first so that the ampersands introduced by the `&lt;`
 * replacement are not escaped a second time.
 *
 * @param {string} text - Raw text to embed inside an XML element.
 * @returns {string} The escaped text, or '' when `text` is not a string.
 */
function xmlEncode(text) {
    if (typeof text !== 'string') return '';
    return text.replace(/&/g, '&amp;').replace(/</g, '&lt;');
}
|
||||
|
||||
/**
 * Resolves a site-relative URL against `origin`, replacing its query string
 * with the encrypted form returned by the site's encryption endpoint.
 *
 * The query string is wrapped in an XML payload and POSTed to
 * `/Common/Agents/ExeFunCommon.aspx` on `origin`; the response body then
 * becomes the new query string (URI-component encoded). When the URL has no
 * query string, the request fails, or the response body is empty, the plain
 * absolute URL is returned instead.
 *
 * @param {string} relativeUrl - Relative URL, optionally with a query string.
 * @param {string} origin - Origin to resolve against and to send the request to.
 * @returns {Promise<string>} The absolute URL ('' when `relativeUrl` is empty).
 */
async function getEncryptedUrl(relativeUrl, origin) {
    if (!relativeUrl) {
        return '';
    }

    // Split on the FIRST '?' only: a query value may itself contain '?',
    // and everything after the first separator has to be encrypted.
    const queryStart = relativeUrl.indexOf('?');
    if (queryStart === -1) {
        return new URL(relativeUrl, origin).href;
    }

    // Named `pagePath` rather than `path` so the local does not shadow the
    // `path` module this file imports.
    const pagePath = relativeUrl.slice(0, queryStart);
    const queryString = relativeUrl.slice(queryStart + 1);

    const xmlPayload = `<?xml version="1.0" encoding="utf-8" standalone="yes"?> <param funname="SouthDigital.BasicFun.MyEncrypt.DefaultEncrypt"> <item>${xmlEncode(queryString)}</item> </param>`;

    const encryptionUrl = new URL('/Common/Agents/ExeFunCommon.aspx', origin).href;

    try {
        const response = await axios.post(encryptionUrl, xmlPayload, {
            headers: {
                'Content-Type': 'application/xml',
            },
        });
        const encryptedQuery = response.data;
        if (encryptedQuery) {
            return `${new URL(pagePath, origin).href}?${encodeURIComponent(encryptedQuery)}`;
        }
    } catch (error) {
        // Best effort: log and fall through to the unencrypted URL below.
        console.error(`加密链接失败: ${relativeUrl}`, error.message);
    }

    // Fallback to original url on error
    return new URL(relativeUrl, origin).href;
}
|
||||
|
||||
|
||||
// 从HTML中提取表格数据的函数
|
||||
async function extractDataFromHtml($, origin) {
|
||||
const rows = $('.resultlist table tr:has(td)').get();
|
||||
@ -270,15 +232,17 @@ export async function main() {
|
||||
|
||||
// 保存为JSON文件
|
||||
const dataPath = path.join(__dirname, '..', 'data', 'project.json');
|
||||
await fs.mkdir(path.dirname(dataPath), { recursive: true });
|
||||
await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8');
|
||||
console.log('项目数据已保存至 data/project.json 文件。');
|
||||
|
||||
console.log('\n所有数据抓取和处理任务已完成。');
|
||||
|
||||
} catch (error) {
|
||||
console.error('抓取或处理过程中发生错误:', error.message);
|
||||
console.error('在 getProject.js 抓取或处理过程中发生错误:', error.message);
|
||||
if (error.response) {
|
||||
console.error('Status:', error.response.status);
|
||||
}
|
||||
throw error; // Re-throw the error to be caught by the caller
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user