通过Playwright抓取数据
This commit is contained in:
parent
7677ca9e3e
commit
e8239d2099
70
README.md
Normal file
70
README.md
Normal file
@ -0,0 +1,70 @@
|
||||
# 广东省公共资源交易平台爬虫
|
||||
|
||||
本项目是一个 Python 爬虫程序,用于抓取 [广东省公共资源交易平台](https://ygp.gdzwfw.gov.cn/#/44/jygg) 的中标结果公告。
|
||||
|
||||
## 功能特性
|
||||
|
||||
- **关键字过滤**:自动筛选标题中包含“中标结果”的公告。
|
||||
- **日期过滤**:支持指定开始和结束日期,默认为抓取当天数据。
|
||||
- **自动分页**:自动处理多页数据抓取。
|
||||
- **动态构造 URL**:根据接口返回字段自动生成可直接访问的详情页链接。
|
||||
- **处理动态加密**:利用 Playwright 驱动浏览器,自动处理 API 请求头中的加密签名。
|
||||
|
||||
## 环境要求
|
||||
|
||||
- Python 3.8+
|
||||
- Chromium 浏览器 (由 Playwright 自动安装)
|
||||
|
||||
## 安装步骤
|
||||
|
||||
1. **克隆项目并进入目录**
|
||||
```bash
|
||||
cd ygp-gdzwfw-gov-cn
|
||||
```
|
||||
|
||||
2. **创建并激活虚拟环境 (推荐)**
|
||||
```bash
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate # macOS/Linux
|
||||
# 或 venv\Scripts\activate # Windows
|
||||
```
|
||||
|
||||
3. **安装依赖**
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
4. **安装 Playwright 浏览器内核**
|
||||
```bash
|
||||
playwright install chromium
|
||||
```
|
||||
|
||||
## 使用方法
|
||||
|
||||
### 1. 抓取今天发布的数据 (默认)
|
||||
直接运行脚本,程序将自动抓取发布日期为今天的“中标结果”公告。
|
||||
```bash
|
||||
python ygp_crawler.py
|
||||
```
|
||||
|
||||
### 2. 抓取指定日期范围
|
||||
使用 `--start-date` 和 `--end-date` 参数(格式:`YYYY-MM-DD`)。
|
||||
```bash
|
||||
python ygp_crawler.py --start-date 2026-02-01 --end-date 2026-02-04
|
||||
```
|
||||
|
||||
### 3. 保存抓取结果
|
||||
脚本将结果以 JSON Lines 格式(每行一个 JSON 对象)输出到控制台。可以使用重定向将其保存到文件中。
|
||||
```bash
|
||||
python ygp_crawler.py --start-date 2026-02-01 > results.jsonl
|
||||
```
|
||||
|
||||
## 数据输出示例
|
||||
|
||||
```json
|
||||
{
|
||||
"项目标题": "某某项目中标结果公示",
|
||||
"发布时间": "20260204173002",
|
||||
"详细链接": "https://ygp.gdzwfw.gov.cn/#/44/new/jygg/v3/A?noticeId=..."
|
||||
}
|
||||
```
|
||||
1
requirements.txt
Normal file
1
requirements.txt
Normal file
@ -0,0 +1 @@
|
||||
playwright
|
||||
199
ygp_crawler.py
Normal file
199
ygp_crawler.py
Normal file
@ -0,0 +1,199 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
import urllib.parse
|
||||
from datetime import datetime, date
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
def parse_args():
    """Parse command-line options controlling the crawl date range.

    Returns:
        argparse.Namespace with ``start_date`` and ``end_date`` attributes
        (raw ``YYYY-MM-DD`` strings, or None when the flag is omitted).
    """
    cli = argparse.ArgumentParser(description="广东省公共资源交易平台数据抓取")
    cli.add_argument("--start-date", help="开始日期 (YYYY-MM-DD)")
    cli.add_argument("--end-date", help="结束日期 (YYYY-MM-DD)")
    return cli.parse_args()
|
||||
|
||||
def parse_api_date(date_str):
    """Convert an API timestamp string (``YYYYMMDDHHMMSS``) to a date.

    Returns None for empty/None input or any string that does not match
    the expected 14-digit timestamp format.
    """
    if not date_str:
        return None
    try:
        parsed = datetime.strptime(date_str, "%Y%m%d%H%M%S")
    except ValueError:
        return None
    return parsed.date()
|
||||
|
||||
def construct_detail_url(item):
    """Build the public detail-page URL for a notice item.

    URL pattern derived from analysis:
    ``https://ygp.gdzwfw.gov.cn/#/44/new/jygg/v3/A?noticeId=...``

    Note: ``nodeId`` is not present in the list response; the detail page
    is assumed to tolerate its absence for direct access.

    Args:
        item: one raw notice dict from the list API response.

    Returns:
        A fully query-encoded URL string.
    """
    base_url = "https://ygp.gdzwfw.gov.cn/#/44/new/jygg/v3/A"

    # (query parameter, API response field) pairs, in the order the
    # front-end emits them. Missing fields default to an empty string.
    field_map = (
        ("noticeId", "noticeId"),
        ("projectCode", "projectCode"),
        ("bizCode", "tradingProcess"),
        ("siteCode", "siteCode"),
        ("publishDate", "publishDate"),
        ("source", "pubServicePlat"),
        ("titleDetails", "noticeSecondTypeDesc"),
        ("classify", "projectType"),
    )
    params = {param: item.get(field, "") for param, field in field_map}

    return base_url + "?" + urllib.parse.urlencode(params)
|
||||
|
||||
async def run():
    """Crawl award-result notices ("中标结果") from the Guangdong public
    resource trading platform and print matches as JSON lines on stdout.

    Side effects: launches a headless Chromium via Playwright, navigates the
    listing page, intercepts the ``**/search/v2/items`` API responses, writes
    one JSON object per matching notice to stdout, and writes progress and
    error messages to stderr. Exits the process (status 1) on invalid
    command-line date arguments.
    """
    args = parse_args()

    # Determine Date Range — both bounds default to today when no flags given.
    today = date.today()
    start_date = today
    end_date = today

    if args.start_date:
        try:
            start_date = datetime.strptime(args.start_date, "%Y-%m-%d").date()
        except ValueError:
            print(f"Error: Invalid start date format: {args.start_date}", file=sys.stderr)
            sys.exit(1)

    if args.end_date:
        try:
            end_date = datetime.strptime(args.end_date, "%Y-%m-%d").date()
        except ValueError:
            print(f"Error: Invalid end date format: {args.end_date}", file=sys.stderr)
            sys.exit(1)

    if start_date > end_date:
        print(f"Error: Start date {start_date} is after end date {end_date}", file=sys.stderr)
        sys.exit(1)

    print(f"Crawling range: {start_date} to {end_date}", file=sys.stderr)

    async with async_playwright() as p:
        # Headless browser session; a desktop UA is set, presumably to avoid
        # bot detection on the target site — TODO confirm it is required.
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            viewport={"width": 1280, "height": 800}
        )
        page = await context.new_page()

        # Helper to process a batch of items.
        # Closes over start_date / end_date from the enclosing scope.
        # Returns (matching results, stop signal for the pagination loop).
        def process_items(items):
            page_results = []
            stop_signal = False
            min_date_on_page = None  # oldest parseable publish date seen on this page

            for item in items:
                title = item.get("noticeTitle", "")
                pub_date_str = item.get("publishDate", "")
                item_date = parse_api_date(pub_date_str)

                if item_date:
                    if min_date_on_page is None or item_date < min_date_on_page:
                        min_date_on_page = item_date

                    # Date Filter
                    # NOTE(review): items whose publishDate fails to parse
                    # bypass this filter and are judged by keyword only —
                    # confirm that is intended.
                    if item_date > end_date:
                        continue  # Too new, skip
                    if item_date < start_date:
                        # Found an item older than start date.
                        # Since lists are usually ordered, this suggests we might be done.
                        # However, to be safe (pinned items?), we just skip it here,
                        # but set a signal that we *might* want to stop if the whole page is old.
                        pass

                # Keyword Filter
                if "中标结果" in title:
                    # Add to results
                    page_results.append({
                        "项目标题": title,
                        "发布时间": pub_date_str,
                        "详细链接": construct_detail_url(item)
                    })

            # Stop condition: If the newest item on the page (or min_date) is older than start_date?
            # Actually, valid items could be anywhere if not strictly sorted.
            # But "min_date_on_page < start_date" means the page contains items older than target.
            # If the *entire* page is older than start_date, we definitely stop.
            # Let's assume strict reverse chronological order for efficiency.
            if min_date_on_page and min_date_on_page < start_date:
                stop_signal = True

            return page_results, stop_signal

        try:
            print("Loading page...", file=sys.stderr)

            # Setup response listener for the INITIAL load — the listing page
            # fires the search API once on navigation; capture that response
            # instead of reverse-engineering the encrypted request headers.
            async with page.expect_response("**/search/v2/items", timeout=15000) as response_info:
                await page.goto("https://ygp.gdzwfw.gov.cn/#/44/jygg")

            response = await response_info.value
            data = await response.json()
            items = data.get("data", {}).get("pageData", [])

            results, stop = process_items(items)
            for r in results:
                # One JSON object per line (JSON Lines), non-ASCII kept readable.
                print(json.dumps(r, ensure_ascii=False))

            if stop:
                print("Date range satisfied (initial page). Stopping.", file=sys.stderr)
                await browser.close()
                return

            # Pagination Loop — drive the on-page "next" button and capture
            # each resulting API call, until exhausted or out of date range.
            page_num = 1
            while True:
                page_num += 1
                print(f"Processing page {page_num}...", file=sys.stderr)

                # Find Next Button
                # Selector strategy: The pagination 'next' button.
                # Usually .btn-next
                next_btn = page.locator(".btn-next")

                # Check if disabled — the widget marks the last page by
                # setting a `disabled` attribute on the button.
                if await next_btn.get_attribute("disabled") is not None:
                    print("Reached last page.", file=sys.stderr)
                    break

                # Click and Wait for API
                try:
                    async with page.expect_response("**/search/v2/items", timeout=10000) as response_info:
                        await next_btn.click()

                    response = await response_info.value
                    if response.status != 200:
                        print(f"API Error: {response.status}", file=sys.stderr)
                        break

                    data = await response.json()
                    items = data.get("data", {}).get("pageData", [])

                    if not items:
                        print("No more items.", file=sys.stderr)
                        break

                    results, stop = process_items(items)
                    for r in results:
                        print(json.dumps(r, ensure_ascii=False))

                    if stop:
                        print("Date range satisfied. Stopping.", file=sys.stderr)
                        break

                except Exception as e:
                    # Best-effort: any pagination failure (timeout, detached
                    # button, bad JSON) ends the crawl rather than crashing.
                    print(f"Error during pagination: {e}", file=sys.stderr)
                    break

        except Exception as e:
            print(f"Fatal Error: {e}", file=sys.stderr)

        await browser.close()
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the async crawler to completion.
    asyncio.run(run())
|
||||
Loading…
Reference in New Issue
Block a user