initial commit

This commit is contained in:
秦秋旭 2026-01-18 14:04:46 +08:00
commit 7a4449ccfb
5 changed files with 3413 additions and 0 deletions

4
.gitignore vendored Normal file
View File

@ -0,0 +1,4 @@
node_modules
.pnpm-store
.DS_Store
debug.html

2870
data.json Normal file

File diff suppressed because it is too large Load Diff

18
package.json Normal file
View File

@ -0,0 +1,18 @@
{
"name": "puning-real-estate",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"packageManager": "pnpm@10.28.0",
"dependencies": {
"axios": "^1.13.2",
"cheerio": "^1.1.2"
},
"type": "module"
}

382
pnpm-lock.yaml Normal file
View File

@ -0,0 +1,382 @@
lockfileVersion: '9.0'
settings:
autoInstallPeers: true
excludeLinksFromLockfile: false
importers:
.:
dependencies:
axios:
specifier: ^1.13.2
version: 1.13.2
cheerio:
specifier: ^1.1.2
version: 1.1.2
packages:
asynckit@0.4.0:
resolution: {integrity: sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==}
axios@1.13.2:
resolution: {integrity: sha512-VPk9ebNqPcy5lRGuSlKx752IlDatOjT9paPlm8A7yOuW2Fbvp4X3JznJtT4f0GzGLLiWE9W8onz51SqLYwzGaA==}
boolbase@1.0.0:
resolution: {integrity: sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==}
call-bind-apply-helpers@1.0.2:
resolution: {integrity: sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==}
engines: {node: '>= 0.4'}
cheerio-select@2.1.0:
resolution: {integrity: sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==}
cheerio@1.1.2:
resolution: {integrity: sha512-IkxPpb5rS/d1IiLbHMgfPuS0FgiWTtFIm/Nj+2woXDLTZ7fOT2eqzgYbdMlLweqlHbsZjxEChoVK+7iph7jyQg==}
engines: {node: '>=20.18.1'}
combined-stream@1.0.8:
resolution: {integrity: sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==}
engines: {node: '>= 0.8'}
css-select@5.2.2:
resolution: {integrity: sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw==}
css-what@6.2.2:
resolution: {integrity: sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA==}
engines: {node: '>= 6'}
delayed-stream@1.0.0:
resolution: {integrity: sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==}
engines: {node: '>=0.4.0'}
dom-serializer@2.0.0:
resolution: {integrity: sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==}
domelementtype@2.3.0:
resolution: {integrity: sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==}
domhandler@5.0.3:
resolution: {integrity: sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==}
engines: {node: '>= 4'}
domutils@3.2.2:
resolution: {integrity: sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==}
dunder-proto@1.0.1:
resolution: {integrity: sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==}
engines: {node: '>= 0.4'}
encoding-sniffer@0.2.1:
resolution: {integrity: sha512-5gvq20T6vfpekVtqrYQsSCFZ1wEg5+wW0/QaZMWkFr6BqD3NfKs0rLCx4rrVlSWJeZb5NBJgVLswK/w2MWU+Gw==}
entities@4.5.0:
resolution: {integrity: sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==}
engines: {node: '>=0.12'}
entities@6.0.1:
resolution: {integrity: sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==}
engines: {node: '>=0.12'}
es-define-property@1.0.1:
resolution: {integrity: sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==}
engines: {node: '>= 0.4'}
es-errors@1.3.0:
resolution: {integrity: sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==}
engines: {node: '>= 0.4'}
es-object-atoms@1.1.1:
resolution: {integrity: sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==}
engines: {node: '>= 0.4'}
es-set-tostringtag@2.1.0:
resolution: {integrity: sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==}
engines: {node: '>= 0.4'}
follow-redirects@1.15.11:
resolution: {integrity: sha512-deG2P0JfjrTxl50XGCDyfI97ZGVCxIpfKYmfyrQ54n5FO/0gfIES8C/Psl6kWVDolizcaaxZJnTS0QSMxvnsBQ==}
engines: {node: '>=4.0'}
peerDependencies:
debug: '*'
peerDependenciesMeta:
debug:
optional: true
form-data@4.0.5:
resolution: {integrity: sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w==}
engines: {node: '>= 6'}
function-bind@1.1.2:
resolution: {integrity: sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==}
get-intrinsic@1.3.0:
resolution: {integrity: sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==}
engines: {node: '>= 0.4'}
get-proto@1.0.1:
resolution: {integrity: sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==}
engines: {node: '>= 0.4'}
gopd@1.2.0:
resolution: {integrity: sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==}
engines: {node: '>= 0.4'}
has-symbols@1.1.0:
resolution: {integrity: sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==}
engines: {node: '>= 0.4'}
has-tostringtag@1.0.2:
resolution: {integrity: sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==}
engines: {node: '>= 0.4'}
hasown@2.0.2:
resolution: {integrity: sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==}
engines: {node: '>= 0.4'}
htmlparser2@10.0.0:
resolution: {integrity: sha512-TwAZM+zE5Tq3lrEHvOlvwgj1XLWQCtaaibSN11Q+gGBAS7Y1uZSWwXXRe4iF6OXnaq1riyQAPFOBtYc77Mxq0g==}
iconv-lite@0.6.3:
resolution: {integrity: sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==}
engines: {node: '>=0.10.0'}
math-intrinsics@1.1.0:
resolution: {integrity: sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==}
engines: {node: '>= 0.4'}
mime-db@1.52.0:
resolution: {integrity: sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==}
engines: {node: '>= 0.6'}
mime-types@2.1.35:
resolution: {integrity: sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==}
engines: {node: '>= 0.6'}
nth-check@2.1.1:
resolution: {integrity: sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==}
parse5-htmlparser2-tree-adapter@7.1.0:
resolution: {integrity: sha512-ruw5xyKs6lrpo9x9rCZqZZnIUntICjQAd0Wsmp396Ul9lN/h+ifgVV1x1gZHi8euej6wTfpqX8j+BFQxF0NS/g==}
parse5-parser-stream@7.1.2:
resolution: {integrity: sha512-JyeQc9iwFLn5TbvvqACIF/VXG6abODeB3Fwmv/TGdLk2LfbWkaySGY72at4+Ty7EkPZj854u4CrICqNk2qIbow==}
parse5@7.3.0:
resolution: {integrity: sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw==}
proxy-from-env@1.1.0:
resolution: {integrity: sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==}
safer-buffer@2.1.2:
resolution: {integrity: sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==}
undici@7.18.2:
resolution: {integrity: sha512-y+8YjDFzWdQlSE9N5nzKMT3g4a5UBX1HKowfdXh0uvAnTaqqwqB92Jt4UXBAeKekDs5IaDKyJFR4X1gYVCgXcw==}
engines: {node: '>=20.18.1'}
whatwg-encoding@3.1.1:
resolution: {integrity: sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==}
engines: {node: '>=18'}
whatwg-mimetype@4.0.0:
resolution: {integrity: sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==}
engines: {node: '>=18'}
snapshots:
asynckit@0.4.0: {}
axios@1.13.2:
dependencies:
follow-redirects: 1.15.11
form-data: 4.0.5
proxy-from-env: 1.1.0
transitivePeerDependencies:
- debug
boolbase@1.0.0: {}
call-bind-apply-helpers@1.0.2:
dependencies:
es-errors: 1.3.0
function-bind: 1.1.2
cheerio-select@2.1.0:
dependencies:
boolbase: 1.0.0
css-select: 5.2.2
css-what: 6.2.2
domelementtype: 2.3.0
domhandler: 5.0.3
domutils: 3.2.2
cheerio@1.1.2:
dependencies:
cheerio-select: 2.1.0
dom-serializer: 2.0.0
domhandler: 5.0.3
domutils: 3.2.2
encoding-sniffer: 0.2.1
htmlparser2: 10.0.0
parse5: 7.3.0
parse5-htmlparser2-tree-adapter: 7.1.0
parse5-parser-stream: 7.1.2
undici: 7.18.2
whatwg-mimetype: 4.0.0
combined-stream@1.0.8:
dependencies:
delayed-stream: 1.0.0
css-select@5.2.2:
dependencies:
boolbase: 1.0.0
css-what: 6.2.2
domhandler: 5.0.3
domutils: 3.2.2
nth-check: 2.1.1
css-what@6.2.2: {}
delayed-stream@1.0.0: {}
dom-serializer@2.0.0:
dependencies:
domelementtype: 2.3.0
domhandler: 5.0.3
entities: 4.5.0
domelementtype@2.3.0: {}
domhandler@5.0.3:
dependencies:
domelementtype: 2.3.0
domutils@3.2.2:
dependencies:
dom-serializer: 2.0.0
domelementtype: 2.3.0
domhandler: 5.0.3
dunder-proto@1.0.1:
dependencies:
call-bind-apply-helpers: 1.0.2
es-errors: 1.3.0
gopd: 1.2.0
encoding-sniffer@0.2.1:
dependencies:
iconv-lite: 0.6.3
whatwg-encoding: 3.1.1
entities@4.5.0: {}
entities@6.0.1: {}
es-define-property@1.0.1: {}
es-errors@1.3.0: {}
es-object-atoms@1.1.1:
dependencies:
es-errors: 1.3.0
es-set-tostringtag@2.1.0:
dependencies:
es-errors: 1.3.0
get-intrinsic: 1.3.0
has-tostringtag: 1.0.2
hasown: 2.0.2
follow-redirects@1.15.11: {}
form-data@4.0.5:
dependencies:
asynckit: 0.4.0
combined-stream: 1.0.8
es-set-tostringtag: 2.1.0
hasown: 2.0.2
mime-types: 2.1.35
function-bind@1.1.2: {}
get-intrinsic@1.3.0:
dependencies:
call-bind-apply-helpers: 1.0.2
es-define-property: 1.0.1
es-errors: 1.3.0
es-object-atoms: 1.1.1
function-bind: 1.1.2
get-proto: 1.0.1
gopd: 1.2.0
has-symbols: 1.1.0
hasown: 2.0.2
math-intrinsics: 1.1.0
get-proto@1.0.1:
dependencies:
dunder-proto: 1.0.1
es-object-atoms: 1.1.1
gopd@1.2.0: {}
has-symbols@1.1.0: {}
has-tostringtag@1.0.2:
dependencies:
has-symbols: 1.1.0
hasown@2.0.2:
dependencies:
function-bind: 1.1.2
htmlparser2@10.0.0:
dependencies:
domelementtype: 2.3.0
domhandler: 5.0.3
domutils: 3.2.2
entities: 6.0.1
iconv-lite@0.6.3:
dependencies:
safer-buffer: 2.1.2
math-intrinsics@1.1.0: {}
mime-db@1.52.0: {}
mime-types@2.1.35:
dependencies:
mime-db: 1.52.0
nth-check@2.1.1:
dependencies:
boolbase: 1.0.0
parse5-htmlparser2-tree-adapter@7.1.0:
dependencies:
domhandler: 5.0.3
parse5: 7.3.0
parse5-parser-stream@7.1.2:
dependencies:
parse5: 7.3.0
parse5@7.3.0:
dependencies:
entities: 6.0.1
proxy-from-env@1.1.0: {}
safer-buffer@2.1.2: {}
undici@7.18.2: {}
whatwg-encoding@3.1.1:
dependencies:
iconv-lite: 0.6.3
whatwg-mimetype@4.0.0: {}

139
scraper.js Normal file
View File

@ -0,0 +1,139 @@
import axios from 'axios';
import * as cheerio from 'cheerio';
import fs from 'fs/promises';
const BASE_URL = 'http://120.236.48.169:89/HPMS/';
const START_URL = BASE_URL + 'presellCertList.aspx';
// Extracts table data from a given HTML content
function extractDataFromHtml($) {
const data = [];
// Corrected selector to find rows with table data, skipping the header
const rows = $('.resultlist table tr:has(td)');
rows.each((i, row) => {
const columns = $(row).find('td');
// Based on debug.html, the structure is different and has 9 columns
if (columns.length >= 9) {
const licenseCell = $(columns[1]); // 许可证号 is the 2nd column
const licenseLinkTag = licenseCell.find('a');
const rowData = {
'序号': $(columns[0]).text().trim(),
'许可证号': licenseLinkTag.text().trim(),
'开发企业': $(columns[2]).text().trim(),
'项目名称': $(columns[3]).text().trim(),
'项目地址': $(columns[4]).text().trim(),
'批准时间': $(columns[5]).text().trim(),
'所在区域': $(columns[6]).text().trim(),
'总套数': $(columns[7]).text().trim(),
'可售套数': $(columns[8]).text().trim(),
'许可证链接': '', // Initialize
};
// The link is inside an onclick attribute, not a standard href
const onclickAttr = licenseLinkTag.attr('onclick');
if (onclickAttr) {
// Make regex flexible to handle single or double quotes
const match = onclickAttr.match(/open_click\(['"]([^'"]+)['"]\)/);
if (match && match[1]) {
// match[1] will be like '/HPMS/PresellDetailsInfo.aspx?id=1012110'
// We need to resolve it against the origin, not the full BASE_URL path
const origin = new URL(BASE_URL).origin;
rowData['许可证链接'] = new URL(match[1], origin).href;
}
}
data.push(rowData);
}
});
return data;
}
async function main() {
try {
console.log('开始抓取第一页数据...');
let response = await axios.get(START_URL, {
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
});
let $ = cheerio.load(response.data);
let allData = extractDataFromHtml($);
console.log(`第一页抓取完成,获得 ${allData.length} 条数据。`);
// Find the total number of pages from the correct element
const pageCountSpan = $('#PageNavigator1_LblPageCount');
const totalPages = pageCountSpan.length ? parseInt(pageCountSpan.text(), 10) : 1;
console.log(`共发现 ${totalPages} 页。`);
// Get initial search form values to persist them across requests
const formValues = {};
$('input[name^="txt"], select').each((idx, el) => {
const name = $(el).attr('name');
if (name) {
formValues[name] = $(el).val() || '';
}
});
// Loop from the second page to the end
for (let i = 2; i <= totalPages; i++) {
console.log(`正在抓取第 ${i} 页...`);
// Get the required form fields from the CURRENT page's response
const viewState = $('#__VIEWSTATE').val();
const viewStateGenerator = $('#__VIEWSTATEGENERATOR').val();
const eventValidation = $('#__EVENTVALIDATION').val();
if (!viewState) {
console.log('无法找到 __VIEWSTATE终止抓取。');
break;
}
const postData = new URLSearchParams();
postData.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext');
postData.append('__EVENTARGUMENT', '');
postData.append('__VIEWSTATE', viewState);
postData.append('__VIEWSTATEGENERATOR', viewStateGenerator);
if(eventValidation) {
postData.append('__EVENTVALIDATION', eventValidation);
}
// Append the initial form values
for (const name in formValues) {
postData.append(name, formValues[name]);
}
postData.append('PageNavigator1$txtNewPageIndex', (i - 1).toString());
response = await axios.post(START_URL, postData, {
headers: {
'Content-Type': 'application/x-www-form-urlencoded',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Referer': START_URL,
}
});
$ = cheerio.load(response.data);
const nextPageData = extractDataFromHtml($);
console.log(`${i} 页抓取完成,获得 ${nextPageData.length} 条数据。`);
if (nextPageData.length === 0) {
console.log(`${i} 页没有数据,抓取结束。`);
break;
}
allData = allData.concat(nextPageData);
}
await fs.writeFile('data.json', JSON.stringify(allData, null, 4), 'utf-8');
console.log(`\n抓取全站数据完毕!共 ${allData.length} 条记录已保存至 data.json 文件。`);
} catch (error) {
console.error('抓取过程中发生错误:', error.message);
if (error.response) {
console.error('Status:', error.response.status);
}
}
}
main();