// puning-real-estate/scripts/getPreSaleLicense.js

import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';

import * as cheerio from 'cheerio';

import axios from './axios.js';

// Root of the HPMS site; it ends with a slash so page names can be appended directly.
const BASE_URL = 'http://120.236.48.169:89/HPMS/';
// Address of the licence list page that the scraper requests.
const START_URL = `${BASE_URL}presellCertList.aspx`;

/**
 * Reads the result table of a loaded list page and turns each data row into a record.
 *
 * Only rows that contain <td> cells are considered (this skips the header row),
 * and rows with fewer than 9 cells are ignored.
 *
 * @param {Function} $ - Cheerio instance loaded with the page HTML.
 * @returns {Array<Object>} One record per data row, in document order.
 */
function extractDataFromHtml($) {
    const records = [];

    $('.resultlist table tr:has(td)').each((_, tr) => {
        const cells = $(tr).find('td');
        // The listing is expected to have 9 columns; anything shorter is not a data row.
        if (cells.length < 9) {
            return;
        }

        // Trimmed text content of the cell at the given column index.
        const cellText = (index) => $(cells[index]).text().trim();

        // The licence number (2nd column) is rendered as a link.
        const anchor = $(cells[1]).find('a');

        const record = {
            '序号': cellText(0),
            '许可证号': anchor.text().trim(),
            '开发企业': cellText(2),
            '项目名称': cellText(3),
            '项目地址': cellText(4),
            '批准时间': cellText(5),
            '所在区域': cellText(6),
            '总套数': cellText(7),
            '可售套数': cellText(8),
            '许可证链接': '', // stays empty unless a target can be parsed below
        };

        // The detail page address lives in the onclick handler rather than in an href.
        const handler = anchor.attr('onclick');
        // Accepts the argument wrapped in either single or double quotes.
        const target = handler ? handler.match(/open_click\(['"]([^'"]+)['"]\)/) : null;
        if (target && target[1]) {
            // target[1] looks like '/HPMS/PresellDetailsInfo.aspx?id=1012110', so it is
            // resolved against the site origin instead of the full BASE_URL path.
            const siteOrigin = new URL(BASE_URL).origin;
            record['许可证链接'] = new URL(target[1], siteOrigin).href;
        }

        records.push(record);
    });

    return records;
}

/**
 * Scrapes all data from the target website, handling pagination.
 *
 * The first page is fetched with GET. Every following page is requested by
 * POSTing the hidden state fields (__VIEWSTATE and friends) read from the page
 * that was loaded last, with the "next page" link button as the event target.
 * Scraping stops early when a page has no __VIEWSTATE or yields no rows.
 *
 * @returns {Promise<Array<Object>>} - An array of all scraped data records, in page order.
 * @throws Any error raised by the HTTP client is propagated to the caller.
 */
async function scrapeWebsite() {
    console.log('开始抓取第一页数据...');
    let response = await axios.get(START_URL);

    let $ = cheerio.load(response.data);
    // Copy so that appending later pages never touches the extractor's own array.
    const allData = [...extractDataFromHtml($)];
    console.log(`第一页抓取完成，获得 ${allData.length} 条数据。`);

    const pageCountSpan = $('#PageNavigator1_LblPageCount');
    const parsedPageCount = pageCountSpan.length ? Number.parseInt(pageCountSpan.text(), 10) : 1;
    // A counter that is present but not numeric is treated like a missing one: single page.
    const totalPages = Number.isNaN(parsedPageCount) ? 1 : parsedPageCount;
    console.log(`共发现 ${totalPages} 页。`);

    // Search-form values from the first page; they are replayed with every postback.
    const formValues = {};
    $('input[name^="txt"], select').each((idx, el) => {
        const name = $(el).attr('name');
        if (name) {
            formValues[name] = $(el).val() || '';
        }
    });

    for (let i = 2; i <= totalPages; i++) {
        console.log(`正在抓取第 ${i} 页...`);

        // State fields must come from the most recently loaded page.
        const viewState = $('#__VIEWSTATE').val();
        const viewStateGenerator = $('#__VIEWSTATEGENERATOR').val();
        const eventValidation = $('#__EVENTVALIDATION').val();

        if (!viewState) {
            console.log('无法找到 __VIEWSTATE，终止抓取。');
            break;
        }

        const postData = new URLSearchParams();
        postData.append('__EVENTTARGET', 'PageNavigator1$LnkBtnNext');
        postData.append('__EVENTARGUMENT', '');
        postData.append('__VIEWSTATE', viewState);
        // URLSearchParams stringifies its values, so an absent field would be
        // posted as the literal text "undefined"; only send it when it exists.
        if (viewStateGenerator !== undefined) {
            postData.append('__VIEWSTATEGENERATOR', viewStateGenerator);
        }
        if (eventValidation) {
            postData.append('__EVENTVALIDATION', eventValidation);
        }

        for (const [name, value] of Object.entries(formValues)) {
            postData.append(name, value);
        }
        // Index of the page currently displayed (the one before the requested page).
        postData.append('PageNavigator1$txtNewPageIndex', (i - 1).toString());

        response = await axios.post(START_URL, postData, {
            headers: {
                'Content-Type': 'application/x-www-form-urlencoded',
                'Referer': START_URL,
            }
        });

        $ = cheerio.load(response.data);
        const nextPageData = extractDataFromHtml($);
        console.log(`第 ${i} 页抓取完成，获得 ${nextPageData.length} 条数据。`);

        if (nextPageData.length === 0) {
            console.log(`第 ${i} 页没有数据，抓取结束。`);
            break;
        }
        allData.push(...nextPageData);
    }
    console.log(`\n抓取全站数据完毕！共 ${allData.length} 条原始记录。`);
    return allData;
}


/**
 * Performs post-processing on the scraped data without modifying the input:
 * 1. Filters out records where "许可证号" is exactly "空".
 * 2. Re-indexes "序号" sequentially, starting at "1" (kept as a string).
 * 3. Converts "总套数" and "可售套数" to integers, using 0 when the text is not numeric.
 *
 * @param {Array<Object>} allData - The raw scraped data; neither the array nor its records are changed.
 * @returns {Promise<Array<Object>>} - Newly created records holding the processed data.
 */
async function processScrapedData(allData) {
    // parseInt yields NaN for blank or non-numeric text; `|| 0` maps that to 0.
    const toCount = (value) => Number.parseInt(value, 10) || 0;

    // 1. Filter out records where "许可证号" is "空"
    const keptRecords = allData.filter((record) => record['许可证号'] !== '空');
    console.log(`删除 "许可证号" 为 "空" 的记录后，剩余 ${keptRecords.length} 条记录。`);

    // 2 + 3. Build fresh records so the caller's objects stay as scraped.
    // Spreading first keeps the original key order; the overrides replace values in place.
    const processedData = keptRecords.map((record, index) => ({
        ...record,
        '序号': (index + 1).toString(),
        '总套数': toCount(record['总套数']),
        '可售套数': toCount(record['可售套数']),
    }));
    console.log('序号字段已重新编号。');
    console.log('"总套数" 和 "可售套数" 字段已转换为数字。');

    return processedData;
}

/**
 * Main entry point - exported so it can be invoked from the project root.
 *
 * Scrapes the site, post-processes the records and writes them as JSON to
 * `../data/preSaleLicense.json` relative to this script.
 *
 * Errors are logged to the console and not rethrown, so the returned promise
 * resolves even when scraping or writing fails.
 *
 * @returns {Promise<void>}
 */
export async function main() {
    try {
        const allData = await scrapeWebsite();

        const processedData = await processScrapedData(allData);

        // fileURLToPath decodes percent-escapes and handles Windows drive letters;
        // reading URL.pathname directly gives a wrong path in both cases.
        const scriptDir = path.dirname(fileURLToPath(import.meta.url));
        const dataPath = path.join(scriptDir, '..', 'data', 'preSaleLicense.json');
        // Create the output directory when it is missing instead of failing with ENOENT.
        await fs.mkdir(path.dirname(dataPath), { recursive: true });
        await fs.writeFile(dataPath, JSON.stringify(processedData, null, 4), 'utf-8');
        console.log(`更新后的数据已保存至 data/preSaleLicense.json 文件。`);

        console.log('\n所有数据处理和文件生成任务已完成。');

    } catch (error) {
        console.error('抓取或处理过程中发生错误:', error.message);
        // Present when the failure carries an HTTP response (e.g. an axios error).
        if (error.response) {
            console.error('Status:', error.response.status);
        }
    }
}