getEncryptedUrl
This commit is contained in:
parent
2b5a602b49
commit
8ff37476f7
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -347,7 +347,7 @@
|
|||||||
"楼幢": [
|
"楼幢": [
|
||||||
{
|
{
|
||||||
"楼幢名称": "2幢",
|
"楼幢名称": "2幢",
|
||||||
"成交均价": "6408.55",
|
"成交均价": "6356.11",
|
||||||
"bid": "1099666"
|
"bid": "1099666"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -582,7 +582,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"楼幢名称": "4幢",
|
"楼幢名称": "4幢",
|
||||||
"成交均价": "8522.14",
|
"成交均价": "8515.79",
|
||||||
"bid": "1099587"
|
"bid": "1099587"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
@ -1942,7 +1942,7 @@
|
|||||||
"楼幢": [
|
"楼幢": [
|
||||||
{
|
{
|
||||||
"楼幢名称": "4幢",
|
"楼幢名称": "4幢",
|
||||||
"成交均价": "6328.95",
|
"成交均价": "6328.52",
|
||||||
"bid": "1099714"
|
"bid": "1099714"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
@ -3097,7 +3097,7 @@
|
|||||||
"楼幢": [
|
"楼幢": [
|
||||||
{
|
{
|
||||||
"楼幢名称": "1幢",
|
"楼幢名称": "1幢",
|
||||||
"成交均价": "7517.44",
|
"成交均价": "7513.09",
|
||||||
"bid": "1099429"
|
"bid": "1099429"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -3274,7 +3274,7 @@
|
|||||||
"楼幢": [
|
"楼幢": [
|
||||||
{
|
{
|
||||||
"楼幢名称": "7幢",
|
"楼幢名称": "7幢",
|
||||||
"成交均价": "7070.37",
|
"成交均价": "7074.62",
|
||||||
"bid": "1099386"
|
"bid": "1099386"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|||||||
47
scripts/getEncryptedUrl.js
Normal file
47
scripts/getEncryptedUrl.js
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
import axios from './axios.js';
|
||||||
|
|
||||||
|
/**
 * Escapes text so it can be embedded as XML character data.
 *
 * `&` is replaced first so that the ampersand introduced by the `<`
 * replacement is not escaped a second time.
 *
 * @param {string} text - Raw text to escape.
 * @returns {string} The escaped text, or '' when `text` is not a string.
 */
function xmlEncode(text) {
  if (typeof text !== 'string') return '';
  return text.replace(/&/g, '&amp;').replace(/</g, '&lt;');
}
|
||||||
|
|
||||||
|
/**
 * Takes a relative URL from the website and returns a fully formed, encrypted URL.
 *
 * Everything after the first `?` is treated as the query string. It is XML-escaped,
 * wrapped in a `DefaultEncrypt` request and POSTed to `/Common/Agents/ExeFunCommon.aspx`
 * on `origin`; the response body (URI-component-encoded) then replaces the original
 * query string.
 *
 * The function never rejects because of the encryption request: if the request fails
 * (the error is logged) or the response body is empty, the plain, unencrypted URL
 * resolved against `origin` is returned instead.
 *
 * @param {string} relativeUrl - The relative URL like 'ProjectDetailsInfo.aspx?a=1&b=2'.
 * @param {string} origin - The origin of the website, e.g., 'http://120.236.48.169:89'.
 * @returns {Promise<string>} - The full, encrypted URL; '' when `relativeUrl` is empty;
 *   the plain resolved URL when there is no query string or encryption is unavailable.
 */
export async function getEncryptedUrl(relativeUrl, origin) {
  if (!relativeUrl) {
    return '';
  }

  // Split on the FIRST '?' only, so a query string that itself contains '?'
  // is sent for encryption in full instead of being truncated.
  const queryStart = relativeUrl.indexOf('?');
  if (queryStart === -1) {
    return new URL(relativeUrl, origin).href;
  }

  const path = relativeUrl.slice(0, queryStart);
  const queryString = relativeUrl.slice(queryStart + 1);

  const xmlPayload = `<?xml version="1.0" encoding="utf-8" standalone="yes"?> <param funname="SouthDigital.BasicFun.MyEncrypt.DefaultEncrypt"> <item>${xmlEncode(queryString)}</item> </param>`;

  const encryptionUrl = new URL('/Common/Agents/ExeFunCommon.aspx', origin).href;

  try {
    const response = await axios.post(encryptionUrl, xmlPayload, {
      headers: {
        'Content-Type': 'application/xml',
      },
    });
    const encryptedQuery = response.data;
    if (encryptedQuery) {
      return `${new URL(path, origin).href}?${encodeURIComponent(encryptedQuery)}`;
    }
  } catch (error) {
    // Best effort: log and fall through to the unencrypted URL below.
    console.error(`加密链接失败: ${relativeUrl}`, error.message);
  }

  // Fallback to original url on error or empty response
  return new URL(relativeUrl, origin).href;
}
|
||||||
@ -2,20 +2,20 @@ import axios from './axios.js';
|
|||||||
import * as cheerio from 'cheerio';
|
import * as cheerio from 'cheerio';
|
||||||
import fs from 'fs/promises';
|
import fs from 'fs/promises';
|
||||||
import path from 'path';
|
import path from 'path';
|
||||||
|
import { getEncryptedUrl } from './getEncryptedUrl.js';
|
||||||
|
|
||||||
const BASE_URL = 'http://120.236.48.169:89/HPMS/';
|
const BASE_URL = 'http://120.236.48.169:89/HPMS/';
|
||||||
const START_URL = BASE_URL + 'presellCertList.aspx';
|
const START_URL = BASE_URL + 'presellCertList.aspx';
|
||||||
|
|
||||||
// Extracts table data from a given HTML content
|
// Extracts table data from a given HTML content
|
||||||
function extractDataFromHtml($) {
|
async function extractDataFromHtml($, origin) {
|
||||||
const data = [];
|
const dataPromises = $('.resultlist table tr:has(td)').get().map(async (row) => {
|
||||||
// Corrected selector to find rows with table data, skipping the header
|
|
||||||
const rows = $('.resultlist table tr:has(td)');
|
|
||||||
|
|
||||||
rows.each((i, row) => {
|
|
||||||
const columns = $(row).find('td');
|
const columns = $(row).find('td');
|
||||||
// Based on debug.html, the structure is different and has 9 columns
|
// Based on debug.html, the structure is different and has 9 columns
|
||||||
if (columns.length >= 9) {
|
if (columns.length < 9) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
const licenseCell = $(columns[1]); // 许可证号 is the 2nd column
|
const licenseCell = $(columns[1]); // 许可证号 is the 2nd column
|
||||||
const licenseLinkTag = licenseCell.find('a');
|
const licenseLinkTag = licenseCell.find('a');
|
||||||
|
|
||||||
@ -38,16 +38,14 @@ function extractDataFromHtml($) {
|
|||||||
// Make regex flexible to handle single or double quotes
|
// Make regex flexible to handle single or double quotes
|
||||||
const match = onclickAttr.match(/open_click\(['"]([^'"]+)['"]\)/);
|
const match = onclickAttr.match(/open_click\(['"]([^'"]+)['"]\)/);
|
||||||
if (match && match[1]) {
|
if (match && match[1]) {
|
||||||
// match[1] will be like '/HPMS/PresellDetailsInfo.aspx?id=1012110'
|
rowData['许可证链接'] = await getEncryptedUrl(match[1], origin);
|
||||||
// We need to resolve it against the origin, not the full BASE_URL path
|
|
||||||
const origin = new URL(BASE_URL).origin;
|
|
||||||
rowData['许可证链接'] = new URL(match[1], origin).href;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
data.push(rowData);
|
return rowData;
|
||||||
}
|
|
||||||
});
|
});
|
||||||
return data;
|
|
||||||
|
const data = await Promise.all(dataPromises);
|
||||||
|
return data.filter(item => item !== null);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -56,11 +54,13 @@ function extractDataFromHtml($) {
|
|||||||
*/
|
*/
|
||||||
async function scrapeWebsite() {
|
async function scrapeWebsite() {
|
||||||
let allData = [];
|
let allData = [];
|
||||||
|
const origin = new URL(BASE_URL).origin;
|
||||||
|
|
||||||
console.log('开始抓取第一页数据...');
|
console.log('开始抓取第一页数据...');
|
||||||
let response = await axios.get(START_URL);
|
let response = await axios.get(START_URL);
|
||||||
|
|
||||||
let $ = cheerio.load(response.data);
|
let $ = cheerio.load(response.data);
|
||||||
allData = extractDataFromHtml($);
|
allData = await extractDataFromHtml($, origin);
|
||||||
console.log(`第一页抓取完成,获得 ${allData.length} 条数据。`);
|
console.log(`第一页抓取完成,获得 ${allData.length} 条数据。`);
|
||||||
|
|
||||||
const pageCountSpan = $('#PageNavigator1_LblPageCount');
|
const pageCountSpan = $('#PageNavigator1_LblPageCount');
|
||||||
@ -109,7 +109,7 @@ async function scrapeWebsite() {
|
|||||||
});
|
});
|
||||||
|
|
||||||
$ = cheerio.load(response.data);
|
$ = cheerio.load(response.data);
|
||||||
const nextPageData = extractDataFromHtml($);
|
const nextPageData = await extractDataFromHtml($, origin);
|
||||||
console.log(`第 ${i} 页抓取完成,获得 ${nextPageData.length} 条数据。`);
|
console.log(`第 ${i} 页抓取完成,获得 ${nextPageData.length} 条数据。`);
|
||||||
|
|
||||||
if (nextPageData.length === 0) {
|
if (nextPageData.length === 0) {
|
||||||
@ -160,15 +160,17 @@ export async function main() {
|
|||||||
const processedData = await processScrapedData(allData);
|
const processedData = await processScrapedData(allData);
|
||||||
|
|
||||||
const dataPath = path.join(path.dirname(new URL(import.meta.url).pathname), '..', 'data', 'preSaleLicense.json');
|
const dataPath = path.join(path.dirname(new URL(import.meta.url).pathname), '..', 'data', 'preSaleLicense.json');
|
||||||
|
await fs.mkdir(path.dirname(dataPath), { recursive: true });
|
||||||
await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8');
|
await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8');
|
||||||
console.log(`更新后的数据已保存至 data/preSaleLicense.json 文件。`);
|
console.log(`更新后的数据已保存至 data/preSaleLicense.json 文件。`);
|
||||||
|
|
||||||
console.log('\n所有数据处理和文件生成任务已完成。');
|
console.log('\n所有数据处理和文件生成任务已完成。');
|
||||||
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('抓取或处理过程中发生错误:', error.message);
|
console.error('在 getPreSaleLicense.js 抓取或处理过程中发生错误:', error.message);
|
||||||
if (error.response) {
|
if (error.response) {
|
||||||
console.error('Status:', error.response.status);
|
console.error('Status:', error.response.status);
|
||||||
}
|
}
|
||||||
|
throw error; // Re-throw the error to be caught by the caller
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -4,52 +4,14 @@ import fs from 'fs/promises';
|
|||||||
import path from 'path';
|
import path from 'path';
|
||||||
import { fileURLToPath } from 'url';
|
import { fileURLToPath } from 'url';
|
||||||
|
|
||||||
|
import { getEncryptedUrl } from './getEncryptedUrl.js';
|
||||||
|
|
||||||
const __filename = fileURLToPath(import.meta.url);
|
const __filename = fileURLToPath(import.meta.url);
|
||||||
const __dirname = path.dirname(__filename);
|
const __dirname = path.dirname(__filename);
|
||||||
|
|
||||||
const BASE_URL = 'http://120.236.48.169:89/HPMS/';
|
const BASE_URL = 'http://120.236.48.169:89/HPMS/';
|
||||||
const START_URL = BASE_URL + 'ProjectInfoList.aspx';
|
const START_URL = BASE_URL + 'ProjectInfoList.aspx';
|
||||||
|
|
||||||
// XML Encode
|
|
||||||
function xmlEncode(text) {
|
|
||||||
return text.replace(/&/g, '&').replace(/</g, '<');
|
|
||||||
}
|
|
||||||
|
|
||||||
async function getEncryptedUrl(relativeUrl, origin) {
|
|
||||||
if (!relativeUrl) {
|
|
||||||
return '';
|
|
||||||
}
|
|
||||||
const urlParts = relativeUrl.split('?');
|
|
||||||
if (urlParts.length < 2) {
|
|
||||||
return new URL(relativeUrl, origin).href;
|
|
||||||
}
|
|
||||||
|
|
||||||
const path = urlParts[0];
|
|
||||||
const queryString = urlParts[1];
|
|
||||||
|
|
||||||
const xmlPayload = `<?xml version="1.0" encoding="utf-8" standalone="yes"?> <param funname="SouthDigital.BasicFun.MyEncrypt.DefaultEncrypt"> <item>${xmlEncode(queryString)}</item> </param>`;
|
|
||||||
|
|
||||||
const encryptionUrl = new URL('/Common/Agents/ExeFunCommon.aspx', origin).href;
|
|
||||||
|
|
||||||
try {
|
|
||||||
const response = await axios.post(encryptionUrl, xmlPayload, {
|
|
||||||
headers: {
|
|
||||||
'Content-Type': 'application/xml',
|
|
||||||
}
|
|
||||||
});
|
|
||||||
const encryptedQuery = response.data;
|
|
||||||
if (encryptedQuery) {
|
|
||||||
return `${new URL(path, origin).href}?${encodeURIComponent(encryptedQuery)}`;
|
|
||||||
}
|
|
||||||
} catch (error) {
|
|
||||||
console.error(`加密链接失败: ${relativeUrl}`, error.message);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fallback to original url on error
|
|
||||||
return new URL(relativeUrl, origin).href;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// 从HTML中提取表格数据的函数
|
// 从HTML中提取表格数据的函数
|
||||||
async function extractDataFromHtml($, origin) {
|
async function extractDataFromHtml($, origin) {
|
||||||
const rows = $('.resultlist table tr:has(td)').get();
|
const rows = $('.resultlist table tr:has(td)').get();
|
||||||
@ -270,15 +232,17 @@ export async function main() {
|
|||||||
|
|
||||||
// 保存为JSON文件
|
// 保存为JSON文件
|
||||||
const dataPath = path.join(__dirname, '..', 'data', 'project.json');
|
const dataPath = path.join(__dirname, '..', 'data', 'project.json');
|
||||||
|
await fs.mkdir(path.dirname(dataPath), { recursive: true });
|
||||||
await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8');
|
await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8');
|
||||||
console.log('项目数据已保存至 data/project.json 文件。');
|
console.log('项目数据已保存至 data/project.json 文件。');
|
||||||
|
|
||||||
console.log('\n所有数据抓取和处理任务已完成。');
|
console.log('\n所有数据抓取和处理任务已完成。');
|
||||||
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('抓取或处理过程中发生错误:', error.message);
|
console.error('在 getProject.js 抓取或处理过程中发生错误:', error.message);
|
||||||
if (error.response) {
|
if (error.response) {
|
||||||
console.error('Status:', error.response.status);
|
console.error('Status:', error.response.status);
|
||||||
}
|
}
|
||||||
|
throw error; // Re-throw the error to be caught by the caller
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Loading…
Reference in New Issue
Block a user