From e8239d2099b5c98e359c1888340a3f41ca1b848c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=A7=A6=E7=A7=8B=E6=97=AD?=
Date: Wed, 4 Feb 2026 19:03:09 +0800
Subject: [PATCH] Scrape data via Playwright
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md        |  84 ++++++++++++++++++++
 requirements.txt |   1 +
 ygp_crawler.py   | 205 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 290 insertions(+)
 create mode 100644 README.md
 create mode 100644 requirements.txt
 create mode 100644 ygp_crawler.py

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f2fa0f5
--- /dev/null
+++ b/README.md
@@ -0,0 +1,84 @@
+# Guangdong Public Resource Trading Platform Crawler
+
+A Python crawler that scrapes winning-bid result ("中标结果") announcements from the [Guangdong Public Resource Trading Platform](https://ygp.gdzwfw.gov.cn/#/44/jygg).
+
+## Features
+
+- **Keyword filtering**: keeps only announcements whose titles contain "中标结果" (winning-bid result).
+- **Date filtering**: accepts explicit start and end dates; defaults to the current day.
+- **Automatic pagination**: walks through every page of results.
+- **Dynamic URL construction**: builds directly accessible detail-page links from fields in the API response.
+- **Dynamic signing handled**: drives a real browser via Playwright, so the site's own JavaScript produces the encrypted signature headers on API requests.
+
+## Requirements
+
+- Python 3.8+
+- Chromium (installed by Playwright)
+
+## Installation
+
+1. **Clone the project and enter the directory**
+   ```bash
+   cd ygp-gdzwfw-gov-cn
+   ```
+
+2. **Create and activate a virtual environment (recommended)**
+   ```bash
+   python3 -m venv venv
+   source venv/bin/activate  # macOS/Linux
+   # or: venv\Scripts\activate  # Windows
+   ```
+
+3. **Install dependencies**
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+4. **Install the Playwright browser binary**
+   ```bash
+   playwright install chromium
+   ```
+
+## Usage
+
+### 1. Scrape today's announcements (default)
+Run the script with no arguments; it fetches the "中标结果" announcements published today.
+```bash
+python ygp_crawler.py
+```
+
+### 2. Scrape a date range
+Pass `--start-date` and `--end-date` (format: `YYYY-MM-DD`).
+```bash
+python ygp_crawler.py --start-date 2026-02-01 --end-date 2026-02-04
+```
+
+### 3. Save the results
+The script prints one JSON object per line to stdout, so the output can be redirected to a file.
+```bash
+python ygp_crawler.py --start-date 2026-02-01 > results.jsonl
+```
+
+## Sample output
+
+```json
+{
+  "项目标题": "某某项目中标结果公示",
+  "发布时间": "20260204173002",
+  "详细链接": "https://ygp.gdzwfw.gov.cn/#/44/new/jygg/v3/A?noticeId=..."
+}
+```
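+
+## Working with the output
+
+Each line of `results.jsonl` is a standalone JSON object, so the file can be processed line by line. A minimal sketch (the Chinese keys are exactly the ones the crawler emits; `results.jsonl` is simply the redirection target used above):
+
+```python
+import json
+
+with open("results.jsonl", encoding="utf-8") as f:
+    for line in f:
+        record = json.loads(line)
+        # "项目标题" = project title, "详细链接" = detail-page URL
+        print(record["项目标题"], "->", record["详细链接"])
+```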
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..508a5f4
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+playwright
diff --git a/ygp_crawler.py b/ygp_crawler.py
new file mode 100644
index 0000000..5719b9e
--- /dev/null
+++ b/ygp_crawler.py
@@ -0,0 +1,205 @@
+import argparse
+import asyncio
+import json
+import sys
+import urllib.parse
+from datetime import datetime, date
+
+from playwright.async_api import async_playwright
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Scrape announcements from the Guangdong Public Resource Trading Platform")
+    parser.add_argument("--start-date", help="Start date (YYYY-MM-DD)")
+    parser.add_argument("--end-date", help="End date (YYYY-MM-DD)")
+    return parser.parse_args()
+
+def parse_api_date(date_str):
+    """Parses a date string from the API (YYYYMMDDHHMMSS) into a date object."""
+    if not date_str:
+        return None
+    try:
+        return datetime.strptime(date_str, "%Y%m%d%H%M%S").date()
+    except ValueError:
+        return None
+
+def construct_detail_url(item):
+    """Constructs the detail page URL from fields of a list-API item."""
+    # Pattern derived from analysis of the site:
+    #   https://ygp.gdzwfw.gov.cn/#/44/new/jygg/v3/A?noticeId=...
+    base_url = "https://ygp.gdzwfw.gov.cn/#/44/new/jygg/v3/A"
+
+    # Map API fields to URL parameters. 'nodeId' is absent from the list
+    # response; the detail page appears not to require it for direct access.
+    params = {
+        "noticeId": item.get("noticeId", ""),
+        "projectCode": item.get("projectCode", ""),
+        "bizCode": item.get("tradingProcess", ""),
+        "siteCode": item.get("siteCode", ""),
+        "publishDate": item.get("publishDate", ""),
+        "source": item.get("pubServicePlat", ""),
+        "titleDetails": item.get("noticeSecondTypeDesc", ""),
+        "classify": item.get("projectType", "")
+    }
+
+    query = urllib.parse.urlencode(params)
+    return f"{base_url}?{query}"
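+
+# Illustrative example (field values are assumed, not from a real response):
+#
+#   construct_detail_url({"noticeId": "N123", "projectCode": "P1",
+#                         "tradingProcess": "JY", "siteCode": "44"})
+#
+# returns (one line; wrapped here for readability):
+#
+#   https://ygp.gdzwfw.gov.cn/#/44/new/jygg/v3/A?noticeId=N123&projectCode=P1
+#       &bizCode=JY&siteCode=44&publishDate=&source=&titleDetails=&classify=
+#
+# urllib.parse.urlencode also percent-encodes any non-ASCII values, such as
+# a Chinese noticeSecondTypeDesc in titleDetails.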
Stopping.", file=sys.stderr) + await browser.close() + return + + # Pagination Loop + page_num = 1 + while True: + page_num += 1 + print(f"Processing page {page_num}...", file=sys.stderr) + + # Find Next Button + # Selector strategy: The pagination 'next' button. + # Usually .btn-next + next_btn = page.locator(".btn-next") + + # Check if disabled + if await next_btn.get_attribute("disabled") is not None: + print("Reached last page.", file=sys.stderr) + break + + # Click and Wait for API + try: + async with page.expect_response("**/search/v2/items", timeout=10000) as response_info: + await next_btn.click() + + response = await response_info.value + if response.status != 200: + print(f"API Error: {response.status}", file=sys.stderr) + break + + data = await response.json() + items = data.get("data", {}).get("pageData", []) + + if not items: + print("No more items.", file=sys.stderr) + break + + results, stop = process_items(items) + for r in results: + print(json.dumps(r, ensure_ascii=False)) + + if stop: + print("Date range satisfied. Stopping.", file=sys.stderr) + break + + except Exception as e: + print(f"Error during pagination: {e}", file=sys.stderr) + break + + except Exception as e: + print(f"Fatal Error: {e}", file=sys.stderr) + + await browser.close() + +if __name__ == "__main__": + asyncio.run(run())