initial commit
This commit is contained in:
commit
7a4449ccfb
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@ -0,0 +1,4 @@
|
||||
node_modules
|
||||
.pnpm-store
|
||||
.DS_Store
|
||||
debug.html
|
||||
18
package.json
Normal file
18
package.json
Normal file
@ -0,0 +1,18 @@
|
||||
{
|
||||
"name": "puning-real-estate",
|
||||
"version": "1.0.0",
|
||||
"description": "",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"test": "echo \"Error: no test specified\" && exit 1"
|
||||
},
|
||||
"keywords": [],
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"packageManager": "pnpm@10.28.0",
|
||||
"dependencies": {
|
||||
"axios": "^1.13.2",
|
||||
"cheerio": "^1.1.2"
|
||||
},
|
||||
"type": "module"
|
||||
}
|
||||
382
pnpm-lock.yaml
Normal file
382
pnpm-lock.yaml
Normal file
@ -0,0 +1,382 @@
|
||||
lockfileVersion: '9.0'
|
||||
|
||||
settings:
|
||||
autoInstallPeers: true
|
||||
excludeLinksFromLockfile: false
|
||||
|
||||
importers:
|
||||
|
||||
.:
|
||||
dependencies:
|
||||
axios:
|
||||
specifier: ^1.13.2
|
||||
version: 1.13.2
|
||||
cheerio:
|
||||
specifier: ^1.1.2
|
||||
version: 1.1.2
|
||||
|
||||
packages:
|
||||
|
||||
asynckit@0.4.0:
|
||||
resolution: {integrity: sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==}
|
||||
|
||||
axios@1.13.2:
|
||||
resolution: {integrity: sha512-VPk9ebNqPcy5lRGuSlKx752IlDatOjT9paPlm8A7yOuW2Fbvp4X3JznJtT4f0GzGLLiWE9W8onz51SqLYwzGaA==}
|
||||
|
||||
boolbase@1.0.0:
|
||||
resolution: {integrity: sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==}
|
||||
|
||||
call-bind-apply-helpers@1.0.2:
|
||||
resolution: {integrity: sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==}
|
||||
engines: {node: '>= 0.4'}
|
||||
|
||||
cheerio-select@2.1.0:
|
||||
resolution: {integrity: sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==}
|
||||
|
||||
cheerio@1.1.2:
|
||||
resolution: {integrity: sha512-IkxPpb5rS/d1IiLbHMgfPuS0FgiWTtFIm/Nj+2woXDLTZ7fOT2eqzgYbdMlLweqlHbsZjxEChoVK+7iph7jyQg==}
|
||||
engines: {node: '>=20.18.1'}
|
||||
|
||||
combined-stream@1.0.8:
|
||||
resolution: {integrity: sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==}
|
||||
engines: {node: '>= 0.8'}
|
||||
|
||||
css-select@5.2.2:
|
||||
resolution: {integrity: sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw==}
|
||||
|
||||
css-what@6.2.2:
|
||||
resolution: {integrity: sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA==}
|
||||
engines: {node: '>= 6'}
|
||||
|
||||
delayed-stream@1.0.0:
|
||||
resolution: {integrity: sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==}
|
||||
engines: {node: '>=0.4.0'}
|
||||
|
||||
dom-serializer@2.0.0:
|
||||
resolution: {integrity: sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==}
|
||||
|
||||
domelementtype@2.3.0:
|
||||
resolution: {integrity: sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==}
|
||||
|
||||
domhandler@5.0.3:
|
||||
resolution: {integrity: sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==}
|
||||
engines: {node: '>= 4'}
|
||||
|
||||
domutils@3.2.2:
|
||||
resolution: {integrity: sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==}
|
||||
|
||||
dunder-proto@1.0.1:
|
||||
resolution: {integrity: sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==}
|
||||
engines: {node: '>= 0.4'}
|
||||
|
||||
encoding-sniffer@0.2.1:
|
||||
resolution: {integrity: sha512-5gvq20T6vfpekVtqrYQsSCFZ1wEg5+wW0/QaZMWkFr6BqD3NfKs0rLCx4rrVlSWJeZb5NBJgVLswK/w2MWU+Gw==}
|
||||
|
||||
entities@4.5.0:
|
||||
resolution: {integrity: sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==}
|
||||
engines: {node: '>=0.12'}
|
||||
|
||||
entities@6.0.1:
|
||||
resolution: {integrity: sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==}
|
||||
engines: {node: '>=0.12'}
|
||||
|
||||
es-define-property@1.0.1:
|
||||
resolution: {integrity: sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==}
|
||||
engines: {node: '>= 0.4'}
|
||||
|
||||
es-errors@1.3.0:
|
||||
resolution: {integrity: sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==}
|
||||
engines: {node: '>= 0.4'}
|
||||
|
||||
es-object-atoms@1.1.1:
|
||||
resolution: {integrity: sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==}
|
||||
engines: {node: '>= 0.4'}
|
||||
|
||||
es-set-tostringtag@2.1.0:
|
||||
resolution: {integrity: sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==}
|
||||
engines: {node: '>= 0.4'}
|
||||
|
||||
follow-redirects@1.15.11:
|
||||
resolution: {integrity: sha512-deG2P0JfjrTxl50XGCDyfI97ZGVCxIpfKYmfyrQ54n5FO/0gfIES8C/Psl6kWVDolizcaaxZJnTS0QSMxvnsBQ==}
|
||||
engines: {node: '>=4.0'}
|
||||
peerDependencies:
|
||||
debug: '*'
|
||||
peerDependenciesMeta:
|
||||
debug:
|
||||
optional: true
|
||||
|
||||
form-data@4.0.5:
|
||||
resolution: {integrity: sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w==}
|
||||
engines: {node: '>= 6'}
|
||||
|
||||
function-bind@1.1.2:
|
||||
resolution: {integrity: sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==}
|
||||
|
||||
get-intrinsic@1.3.0:
|
||||
resolution: {integrity: sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==}
|
||||
engines: {node: '>= 0.4'}
|
||||
|
||||
get-proto@1.0.1:
|
||||
resolution: {integrity: sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==}
|
||||
engines: {node: '>= 0.4'}
|
||||
|
||||
gopd@1.2.0:
|
||||
resolution: {integrity: sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==}
|
||||
engines: {node: '>= 0.4'}
|
||||
|
||||
has-symbols@1.1.0:
|
||||
resolution: {integrity: sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==}
|
||||
engines: {node: '>= 0.4'}
|
||||
|
||||
has-tostringtag@1.0.2:
|
||||
resolution: {integrity: sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==}
|
||||
engines: {node: '>= 0.4'}
|
||||
|
||||
hasown@2.0.2:
|
||||
resolution: {integrity: sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==}
|
||||
engines: {node: '>= 0.4'}
|
||||
|
||||
htmlparser2@10.0.0:
|
||||
resolution: {integrity: sha512-TwAZM+zE5Tq3lrEHvOlvwgj1XLWQCtaaibSN11Q+gGBAS7Y1uZSWwXXRe4iF6OXnaq1riyQAPFOBtYc77Mxq0g==}
|
||||
|
||||
iconv-lite@0.6.3:
|
||||
resolution: {integrity: sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==}
|
||||
engines: {node: '>=0.10.0'}
|
||||
|
||||
math-intrinsics@1.1.0:
|
||||
resolution: {integrity: sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==}
|
||||
engines: {node: '>= 0.4'}
|
||||
|
||||
mime-db@1.52.0:
|
||||
resolution: {integrity: sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==}
|
||||
engines: {node: '>= 0.6'}
|
||||
|
||||
mime-types@2.1.35:
|
||||
resolution: {integrity: sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==}
|
||||
engines: {node: '>= 0.6'}
|
||||
|
||||
nth-check@2.1.1:
|
||||
resolution: {integrity: sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==}
|
||||
|
||||
parse5-htmlparser2-tree-adapter@7.1.0:
|
||||
resolution: {integrity: sha512-ruw5xyKs6lrpo9x9rCZqZZnIUntICjQAd0Wsmp396Ul9lN/h+ifgVV1x1gZHi8euej6wTfpqX8j+BFQxF0NS/g==}
|
||||
|
||||
parse5-parser-stream@7.1.2:
|
||||
resolution: {integrity: sha512-JyeQc9iwFLn5TbvvqACIF/VXG6abODeB3Fwmv/TGdLk2LfbWkaySGY72at4+Ty7EkPZj854u4CrICqNk2qIbow==}
|
||||
|
||||
parse5@7.3.0:
|
||||
resolution: {integrity: sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw==}
|
||||
|
||||
proxy-from-env@1.1.0:
|
||||
resolution: {integrity: sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==}
|
||||
|
||||
safer-buffer@2.1.2:
|
||||
resolution: {integrity: sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==}
|
||||
|
||||
undici@7.18.2:
|
||||
resolution: {integrity: sha512-y+8YjDFzWdQlSE9N5nzKMT3g4a5UBX1HKowfdXh0uvAnTaqqwqB92Jt4UXBAeKekDs5IaDKyJFR4X1gYVCgXcw==}
|
||||
engines: {node: '>=20.18.1'}
|
||||
|
||||
whatwg-encoding@3.1.1:
|
||||
resolution: {integrity: sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==}
|
||||
engines: {node: '>=18'}
|
||||
|
||||
whatwg-mimetype@4.0.0:
|
||||
resolution: {integrity: sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==}
|
||||
engines: {node: '>=18'}
|
||||
|
||||
snapshots:
|
||||
|
||||
asynckit@0.4.0: {}
|
||||
|
||||
axios@1.13.2:
|
||||
dependencies:
|
||||
follow-redirects: 1.15.11
|
||||
form-data: 4.0.5
|
||||
proxy-from-env: 1.1.0
|
||||
transitivePeerDependencies:
|
||||
- debug
|
||||
|
||||
boolbase@1.0.0: {}
|
||||
|
||||
call-bind-apply-helpers@1.0.2:
|
||||
dependencies:
|
||||
es-errors: 1.3.0
|
||||
function-bind: 1.1.2
|
||||
|
||||
cheerio-select@2.1.0:
|
||||
dependencies:
|
||||
boolbase: 1.0.0
|
||||
css-select: 5.2.2
|
||||
css-what: 6.2.2
|
||||
domelementtype: 2.3.0
|
||||
domhandler: 5.0.3
|
||||
domutils: 3.2.2
|
||||
|
||||
cheerio@1.1.2:
|
||||
dependencies:
|
||||
cheerio-select: 2.1.0
|
||||
dom-serializer: 2.0.0
|
||||
domhandler: 5.0.3
|
||||
domutils: 3.2.2
|
||||
encoding-sniffer: 0.2.1
|
||||
htmlparser2: 10.0.0
|
||||
parse5: 7.3.0
|
||||
parse5-htmlparser2-tree-adapter: 7.1.0
|
||||
parse5-parser-stream: 7.1.2
|
||||
undici: 7.18.2
|
||||
whatwg-mimetype: 4.0.0
|
||||
|
||||
combined-stream@1.0.8:
|
||||
dependencies:
|
||||
delayed-stream: 1.0.0
|
||||
|
||||
css-select@5.2.2:
|
||||
dependencies:
|
||||
boolbase: 1.0.0
|
||||
css-what: 6.2.2
|
||||
domhandler: 5.0.3
|
||||
domutils: 3.2.2
|
||||
nth-check: 2.1.1
|
||||
|
||||
css-what@6.2.2: {}
|
||||
|
||||
delayed-stream@1.0.0: {}
|
||||
|
||||
dom-serializer@2.0.0:
|
||||
dependencies:
|
||||
domelementtype: 2.3.0
|
||||
domhandler: 5.0.3
|
||||
entities: 4.5.0
|
||||
|
||||
domelementtype@2.3.0: {}
|
||||
|
||||
domhandler@5.0.3:
|
||||
dependencies:
|
||||
domelementtype: 2.3.0
|
||||
|
||||
domutils@3.2.2:
|
||||
dependencies:
|
||||
dom-serializer: 2.0.0
|
||||
domelementtype: 2.3.0
|
||||
domhandler: 5.0.3
|
||||
|
||||
dunder-proto@1.0.1:
|
||||
dependencies:
|
||||
call-bind-apply-helpers: 1.0.2
|
||||
es-errors: 1.3.0
|
||||
gopd: 1.2.0
|
||||
|
||||
encoding-sniffer@0.2.1:
|
||||
dependencies:
|
||||
iconv-lite: 0.6.3
|
||||
whatwg-encoding: 3.1.1
|
||||
|
||||
entities@4.5.0: {}
|
||||
|
||||
entities@6.0.1: {}
|
||||
|
||||
es-define-property@1.0.1: {}
|
||||
|
||||
es-errors@1.3.0: {}
|
||||
|
||||
es-object-atoms@1.1.1:
|
||||
dependencies:
|
||||
es-errors: 1.3.0
|
||||
|
||||
es-set-tostringtag@2.1.0:
|
||||
dependencies:
|
||||
es-errors: 1.3.0
|
||||
get-intrinsic: 1.3.0
|
||||
has-tostringtag: 1.0.2
|
||||
hasown: 2.0.2
|
||||
|
||||
follow-redirects@1.15.11: {}
|
||||
|
||||
form-data@4.0.5:
|
||||
dependencies:
|
||||
asynckit: 0.4.0
|
||||
combined-stream: 1.0.8
|
||||
es-set-tostringtag: 2.1.0
|
||||
hasown: 2.0.2
|
||||
mime-types: 2.1.35
|
||||
|
||||
function-bind@1.1.2: {}
|
||||
|
||||
get-intrinsic@1.3.0:
|
||||
dependencies:
|
||||
call-bind-apply-helpers: 1.0.2
|
||||
es-define-property: 1.0.1
|
||||
es-errors: 1.3.0
|
||||
es-object-atoms: 1.1.1
|
||||
function-bind: 1.1.2
|
||||
get-proto: 1.0.1
|
||||
gopd: 1.2.0
|
||||
has-symbols: 1.1.0
|
||||
hasown: 2.0.2
|
||||
math-intrinsics: 1.1.0
|
||||
|
||||
get-proto@1.0.1:
|
||||
dependencies:
|
||||
dunder-proto: 1.0.1
|
||||
es-object-atoms: 1.1.1
|
||||
|
||||
gopd@1.2.0: {}
|
||||
|
||||
has-symbols@1.1.0: {}
|
||||
|
||||
has-tostringtag@1.0.2:
|
||||
dependencies:
|
||||
has-symbols: 1.1.0
|
||||
|
||||
hasown@2.0.2:
|
||||
dependencies:
|
||||
function-bind: 1.1.2
|
||||
|
||||
htmlparser2@10.0.0:
|
||||
dependencies:
|
||||
domelementtype: 2.3.0
|
||||
domhandler: 5.0.3
|
||||
domutils: 3.2.2
|
||||
entities: 6.0.1
|
||||
|
||||
iconv-lite@0.6.3:
|
||||
dependencies:
|
||||
safer-buffer: 2.1.2
|
||||
|
||||
math-intrinsics@1.1.0: {}
|
||||
|
||||
mime-db@1.52.0: {}
|
||||
|
||||
mime-types@2.1.35:
|
||||
dependencies:
|
||||
mime-db: 1.52.0
|
||||
|
||||
nth-check@2.1.1:
|
||||
dependencies:
|
||||
boolbase: 1.0.0
|
||||
|
||||
parse5-htmlparser2-tree-adapter@7.1.0:
|
||||
dependencies:
|
||||
domhandler: 5.0.3
|
||||
parse5: 7.3.0
|
||||
|
||||
parse5-parser-stream@7.1.2:
|
||||
dependencies:
|
||||
parse5: 7.3.0
|
||||
|
||||
parse5@7.3.0:
|
||||
dependencies:
|
||||
entities: 6.0.1
|
||||
|
||||
proxy-from-env@1.1.0: {}
|
||||
|
||||
safer-buffer@2.1.2: {}
|
||||
|
||||
undici@7.18.2: {}
|
||||
|
||||
whatwg-encoding@3.1.1:
|
||||
dependencies:
|
||||
iconv-lite: 0.6.3
|
||||
|
||||
whatwg-mimetype@4.0.0: {}
|
||||
139
scraper.js
Normal file
139
scraper.js
Normal file
@ -0,0 +1,139 @@
|
||||
import axios from 'axios';
|
||||
import * as cheerio from 'cheerio';
|
||||
import fs from 'fs/promises';
|
||||
|
||||
const BASE_URL = 'http://120.236.48.169:89/HPMS/';
|
||||
const START_URL = BASE_URL + 'presellCertList.aspx';
|
||||
|
||||
// Extracts table data from a given HTML content
|
||||
function extractDataFromHtml($) {
|
||||
const data = [];
|
||||
// Corrected selector to find rows with table data, skipping the header
|
||||
const rows = $('.resultlist table tr:has(td)');
|
||||
|
||||
rows.each((i, row) => {
|
||||
const columns = $(row).find('td');
|
||||
// Based on debug.html, the structure is different and has 9 columns
|
||||
if (columns.length >= 9) {
|
||||
const licenseCell = $(columns[1]); // 许可证号 is the 2nd column
|
||||
const licenseLinkTag = licenseCell.find('a');
|
||||
|
||||
const rowData = {
|
||||
'序号': $(columns[0]).text().trim(),
|
||||
'许可证号': licenseLinkTag.text().trim(),
|
||||
'开发企业': $(columns[2]).text().trim(),
|
||||
'项目名称': $(columns[3]).text().trim(),
|
||||
'项目地址': $(columns[4]).text().trim(),
|
||||
'批准时间': $(columns[5]).text().trim(),
|
||||
'所在区域': $(columns[6]).text().trim(),
|
||||
'总套数': $(columns[7]).text().trim(),
|
||||
'可售套数': $(columns[8]).text().trim(),
|
||||
'许可证链接': '', // Initialize
|
||||
};
|
||||
|
||||
// The link is inside an onclick attribute, not a standard href
|
||||
const onclickAttr = licenseLinkTag.attr('onclick');
|
||||
if (onclickAttr) {
|
||||
// Make regex flexible to handle single or double quotes
|
||||
const match = onclickAttr.match(/open_click\(['"]([^'"]+)['"]\)/);
|
||||
if (match && match[1]) {
|
||||
// match[1] will be like '/HPMS/PresellDetailsInfo.aspx?id=1012110'
|
||||
// We need to resolve it against the origin, not the full BASE_URL path
|
||||
const origin = new URL(BASE_URL).origin;
|
||||
rowData['许可证链接'] = new URL(match[1], origin).href;
|
||||
}
|
||||
}
|
||||
data.push(rowData);
|
||||
}
|
||||
});
|
||||
return data;
|
||||
}
|
||||
|
||||
async function main() {
|
||||
try {
|
||||
console.log('开始抓取第一页数据...');
|
||||
let response = await axios.get(START_URL, {
|
||||
headers: {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||
}
|
||||
});
|
||||
|
||||
let $ = cheerio.load(response.data);
|
||||
let allData = extractDataFromHtml($);
|
||||
console.log(`第一页抓取完成,获得 ${allData.length} 条数据。`);
|
||||
|
||||
// Find the total number of pages from the correct element
|
||||
const pageCountSpan = $('#PageNavigator1_LblPageCount');
|
||||
const totalPages = pageCountSpan.length ? parseInt(pageCountSpan.text(), 10) : 1;
|
||||
console.log(`共发现 ${totalPages} 页。`);
|
||||
|
||||
// Get initial search form values to persist them across requests
|
||||
const formValues = {};
|
||||
$('input[name^="txt"], select').each((idx, el) => {
|
||||
const name = $(el).attr('name');
|
||||
if (name) {
|
||||
formValues[name] = $(el).val() || '';
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
// Loop from the second page to the end
|
||||
for (let i = 2; i <= totalPages; i++) {
|
||||
console.log(`正在抓取第 ${i} 页...`);
|
||||
|
||||
// Get the required form fields from the CURRENT page's response
|
||||
const viewState = $('#__VIEWSTATE').val();
|
||||
const viewStateGenerator = $('#__VIEWSTATEGENERATOR').val();
|
||||
const eventValidation = $('#__EVENTVALIDATION').val();
|
||||
|
||||
if (!viewState) {
|
||||
console.log('无法找到 __VIEWSTATE,终止抓取。');
|
||||
break;
|
||||
}
|
||||
|
||||
const postData = new URLSearchParams();
|
||||
postData.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext');
|
||||
postData.append('__EVENTARGUMENT', '');
|
||||
postData.append('__VIEWSTATE', viewState);
|
||||
postData.append('__VIEWSTATEGENERATOR', viewStateGenerator);
|
||||
if(eventValidation) {
|
||||
postData.append('__EVENTVALIDATION', eventValidation);
|
||||
}
|
||||
|
||||
// Append the initial form values
|
||||
for (const name in formValues) {
|
||||
postData.append(name, formValues[name]);
|
||||
}
|
||||
postData.append('PageNavigator1$txtNewPageIndex', (i - 1).toString());
|
||||
|
||||
response = await axios.post(START_URL, postData, {
|
||||
headers: {
|
||||
'Content-Type': 'application/x-www-form-urlencoded',
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
||||
'Referer': START_URL,
|
||||
}
|
||||
});
|
||||
|
||||
$ = cheerio.load(response.data);
|
||||
const nextPageData = extractDataFromHtml($);
|
||||
console.log(`第 ${i} 页抓取完成,获得 ${nextPageData.length} 条数据。`);
|
||||
|
||||
if (nextPageData.length === 0) {
|
||||
console.log(`第 ${i} 页没有数据,抓取结束。`);
|
||||
break;
|
||||
}
|
||||
allData = allData.concat(nextPageData);
|
||||
}
|
||||
|
||||
await fs.writeFile('data.json', JSON.stringify(allData, null, 4), 'utf-8');
|
||||
console.log(`\n抓取全站数据完毕!共 ${allData.length} 条记录已保存至 data.json 文件。`);
|
||||
|
||||
} catch (error) {
|
||||
console.error('抓取过程中发生错误:', error.message);
|
||||
if (error.response) {
|
||||
console.error('Status:', error.response.status);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
main();
|
||||
Loading…
Reference in New Issue
Block a user