Incremental crawling, data sorting

秦秋旭 2026-02-04 21:14:18 +08:00
parent 8d302a8b55
commit 4a458a897b
2 changed files with 166 additions and 53 deletions


@@ -11,6 +11,8 @@
 - **Pure HTTP requests**: calls the official API directly with aiohttp; no browser needed, lightweight and efficient.
 - **Real-time CSV saving**: data is written to the CSV file as it is fetched and also printed to the terminal.
 - **Custom output path**: the output file path can be specified via a command-line argument.
+- **Incremental crawling**: automatically derives the date range from an existing CSV file and fetches only new data, avoiding duplicates.
+- **Data sorting**: new data first, old data after, ordered by publish time in descending order.
 ## Requirements
@@ -55,6 +57,27 @@ python ygp_crawler.py --start-date 2026-02-01 --end-date 2026-02-04
 python ygp_crawler.py --start-date 2026-02-01 --end-date 2026-02-04 -o my_data.csv
 ```
+### 4. Incremental crawling
+Use the `-i` or `--incremental` flag to enable incremental mode. The script reads the existing CSV file, works out the date range automatically, and fetches only new data.
+**Automatically computed date range (recommended)**
+```bash
+# Crawl from the day after the newest date in the existing data up to today
+python ygp_crawler.py -i
+```
+**Manually specified date range**
+```bash
+# Specify the date range explicitly while in incremental mode
+python ygp_crawler.py -i --start-date 2026-02-01 --end-date 2026-02-04
+```
+**Incremental crawling behavior**
+- Detects the date range of the data already present in the CSV file
+- Deduplicates automatically (based on the detail link)
+- Inserts new rows at the top of the file and keeps the existing rows below
+- Exits automatically when there is no new data to fetch
 ## Data output example
 ### Terminal output (JSON format, for readability)
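The incremental mode described above comes down to a small merge step: read the rows already in the CSV, drop any freshly fetched row whose detail link is already present, then write the new rows first and keep the old rows below them. The snippet below is a minimal standalone sketch of that logic, mirroring the `read_existing_csv` and `deduplicate_results` helpers in the script diff further down; the names `merge_incremental` and `next_start_date` are illustrative only and do not appear in the script.

```python
import csv
from datetime import date, datetime, timedelta

def next_start_date(existing_rows):
    """Day after the newest 发布时间 in the existing rows, or today if there are none."""
    dates = []
    for row in existing_rows:
        try:
            dates.append(datetime.strptime(row[1], "%Y-%m-%d %H:%M:%S").date())
        except (IndexError, ValueError):
            continue  # skip malformed rows
    return max(dates) + timedelta(days=1) if dates else date.today()

def merge_incremental(csv_path, new_rows):
    """Prepend new rows to the CSV, skipping rows whose detail link already exists.
    Columns are assumed to be 项目标题, 发布时间, 详细链接, in that order."""
    try:
        with open(csv_path, "r", encoding="utf-8", newline="") as f:
            reader = csv.reader(f)
            next(reader, None)  # skip header
            existing = [row for row in reader if len(row) >= 3]
    except FileNotFoundError:
        existing = []

    seen = {row[2] for row in existing}  # dedup key: the detail link
    fresh = [r for r in new_rows if r[2] not in seen]

    with open(csv_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["项目标题", "发布时间", "详细链接"])
        writer.writerows(fresh)     # newest data first
        writer.writerows(existing)  # older data kept below
    return len(fresh)
```

Because results arrive newest-first and new rows are always written above the existing ones, the file stays in descending publish-time order without an explicit sort step.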


@@ -2,9 +2,10 @@ import argparse
 import asyncio
 import csv
 import json
+import os
 import sys
 import urllib.parse
-from datetime import datetime, date
+from datetime import datetime, date, timedelta
 import aiohttp
@@ -12,6 +13,7 @@ async def delay(ms: int):
     """Async delay in milliseconds."""
     await asyncio.sleep(ms / 1000)
 API_BASE_URL = "https://ygp.gdzwfw.gov.cn/ggzy-portal"
@@ -20,6 +22,7 @@ def parse_args():
     parser.add_argument("--start-date", help="开始日期 (YYYY-MM-DD)")
     parser.add_argument("--end-date", help="结束日期 (YYYY-MM-DD)")
     parser.add_argument("--output", "-o", default="results.csv", help="输出CSV文件路径 (默认: results.csv)")
+    parser.add_argument("--incremental", "-i", action="store_true", help="启用增量爬取模式")
     return parser.parse_args()
@@ -33,6 +36,16 @@ def parse_api_date(date_str):
         return None
+def parse_csv_datetime(date_str):
+    """Parses CSV datetime string (YYYY-MM-DD HH:mm:ss) to date object."""
+    if not date_str:
+        return None
+    try:
+        return datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S").date()
+    except ValueError:
+        return None
 def format_datetime(date_str):
     """Formats API date string (YYYYMMDDHHMMSS) to YYYY-MM-DD HH:mm:ss."""
     if not date_str:
@@ -44,6 +57,44 @@ def format_datetime(date_str):
         return date_str
+def read_existing_csv(csv_path):
+    """
+    Reads existing CSV file and returns:
+    - existing_data: list of rows (excluding header)
+    - min_date: oldest date in the file
+    - max_date: newest date in the file
+    """
+    if not os.path.exists(csv_path):
+        return [], None, None
+    existing_data = []
+    min_date = None
+    max_date = None
+    try:
+        with open(csv_path, "r", encoding="utf-8", newline="") as f:
+            reader = csv.reader(f)
+            header = next(reader, None)  # Skip header
+            if not header:
+                return [], None, None
+            for row in reader:
+                if len(row) >= 2:
+                    existing_data.append(row)
+                    # Parse date from row (index 1 is 发布时间)
+                    row_date = parse_csv_datetime(row[1])
+                    if row_date:
+                        if min_date is None or row_date < min_date:
+                            min_date = row_date
+                        if max_date is None or row_date > max_date:
+                            max_date = row_date
+    except Exception as e:
+        print(f"Warning: Error reading existing CSV: {e}", file=sys.stderr)
+        return [], None, None
+    return existing_data, min_date, max_date
 def construct_detail_url(item):
     """Constructs the detail page URL based on item data.
@@ -75,10 +126,7 @@ def construct_detail_url(item):
 def build_search_payload(page_num=1, page_size=10):
-    """Build the search API payload.
-    Based on analysis of the frontend code (JyggFilter component).
-    """
+    """Build the search API payload."""
     return {
         "pageNo": page_num,
         "pageSize": page_size,
@@ -162,6 +210,23 @@ async def fetch_page(session, page_num, page_size=10):
         return None
+def deduplicate_results(new_results, existing_data):
+    """Remove duplicates from new_results based on existing_data."""
+    # Create a set of existing URLs for fast lookup
+    existing_urls = set()
+    for row in existing_data:
+        if len(row) >= 3:
+            existing_urls.add(row[2])  # 详细链接 is the 3rd column
+    # Filter out duplicates
+    unique_results = []
+    for r in new_results:
+        if r["详细链接"] not in existing_urls:
+            unique_results.append(r)
+    return unique_results
 async def run():
     args = parse_args()
@@ -169,12 +234,27 @@ async def run():
     start_date = today
     end_date = today
+    # Read existing CSV if in incremental mode
+    existing_data = []
+    csv_min_date = None
+    csv_max_date = None
+    if args.incremental and os.path.exists(args.output):
+        existing_data, csv_min_date, csv_max_date = read_existing_csv(args.output)
+        if csv_min_date and csv_max_date:
+            print(f"Existing data range: {csv_min_date} to {csv_max_date}", file=sys.stderr)
+            print(f"Existing records: {len(existing_data)}", file=sys.stderr)
+    # Determine date range
     if args.start_date:
         try:
             start_date = datetime.strptime(args.start_date, "%Y-%m-%d").date()
         except ValueError:
             print(f"Error: Invalid start date format: {args.start_date}", file=sys.stderr)
             sys.exit(1)
+    elif args.incremental and csv_max_date:
+        # In incremental mode without explicit start_date, fetch from max_date+1 to today
+        start_date = csv_max_date + timedelta(days=1)
     if args.end_date:
         try:
@@ -184,27 +264,25 @@
             sys.exit(1)
     if start_date > end_date:
-        print(f"Error: Start date {start_date} is after end date {end_date}", file=sys.stderr)
-        sys.exit(1)
+        print(f"Info: Start date {start_date} is after end date {end_date}, no new data to fetch.", file=sys.stderr)
+        print(f"Existing records: {len(existing_data)}", file=sys.stderr)
+        sys.exit(0)
     print(f"Crawling range: {start_date} to {end_date}", file=sys.stderr)
     print(f"Output file: {args.output}", file=sys.stderr)
+    if args.incremental:
+        print(f"Incremental mode: ON", file=sys.stderr)
-    # Open CSV file and write header
-    csv_file = open(args.output, "w", newline="", encoding="utf-8")
-    csv_writer = csv.writer(csv_file)
-    csv_writer.writerow(["项目标题", "发布时间", "详细链接"])
-    csv_file.flush()
-    try:
+    # Collect all new results first
+    new_results = []
     async with aiohttp.ClientSession() as session:
         page_num = 1
-        total_results = 0
         while True:
             print(f"Processing page {page_num}...", file=sys.stderr)
-            await delay(500)  # Initial delay before first request too
+            await delay(500)
             resp = await fetch_page(session, page_num)
@@ -212,7 +290,6 @@
                 print("Failed to fetch data. Stopping.", file=sys.stderr)
                 break
-            # API returns {errcode, errmsg, data}
             data = resp.get("data", {})
             items = data.get("pageData", [])
@@ -221,23 +298,17 @@
                 break
             results, stop = process_items(items, start_date, end_date)
+            new_results.extend(results)
+            # Print to console immediately
             for r in results:
-                # Print to console (as JSON for readability)
                 print(json.dumps(r, ensure_ascii=False))
                 sys.stdout.flush()
-                # Write to CSV immediately
-                csv_writer.writerow([r["项目标题"], r["发布时间"], r["详细链接"]])
-                csv_file.flush()
-                total_results += 1
             if stop:
                 print("Date range satisfied. Stopping.", file=sys.stderr)
                 break
-            # Check if we've reached the last page
-            # API returns pageTotal, not pages
-            total = int(data.get("total", 0))
             pages = data.get("pageTotal", 0)
             if page_num >= pages:
@@ -245,11 +316,30 @@
                 break
             page_num += 1
-            await delay(1000)  # 1s delay between requests
+            await delay(1000)
-        print(f"\nTotal results saved: {total_results}", file=sys.stderr)
-    finally:
-        csv_file.close()
+    print(f"\nNew results fetched: {len(new_results)}", file=sys.stderr)
+    # Deduplicate if in incremental mode
+    if args.incremental and existing_data:
+        new_results = deduplicate_results(new_results, existing_data)
+        print(f"After deduplication: {len(new_results)}", file=sys.stderr)
+    # Write to CSV: new data first, then existing data
+    with open(args.output, "w", newline="", encoding="utf-8") as csv_file:
+        csv_writer = csv.writer(csv_file)
+        csv_writer.writerow(["项目标题", "发布时间", "详细链接"])
+        # Write new results first (newer data)
+        for r in new_results:
+            csv_writer.writerow([r["项目标题"], r["发布时间"], r["详细链接"]])
+        # Write existing data (older data)
+        for row in existing_data:
+            csv_writer.writerow(row[:3])  # Only write first 3 columns
+    total_records = len(new_results) + len(existing_data)
+    print(f"Total records in CSV: {total_records}", file=sys.stderr)
 if __name__ == "__main__":