import argparse
import asyncio
import json
import sys
import urllib.parse
from datetime import datetime, date

from playwright.async_api import async_playwright


def parse_args():
    parser = argparse.ArgumentParser(
        description="Scraper for the Guangdong Public Resource Trading Platform (广东省公共资源交易平台)"
    )
    parser.add_argument("--start-date", help="Start date (YYYY-MM-DD)")
    parser.add_argument("--end-date", help="End date (YYYY-MM-DD)")
    return parser.parse_args()


def parse_api_date(date_str):
    """Parse a date string from the API (YYYYMMDDHHMMSS) into a date object."""
    if not date_str:
        return None
    try:
        return datetime.strptime(date_str, "%Y%m%d%H%M%S").date()
    except ValueError:
        return None


def construct_detail_url(item):
    """Construct the detail page URL from item data."""
    # Pattern derived from analysis:
    # https://ygp.gdzwfw.gov.cn/#/44/new/jygg/v3/A?noticeId=...
    base_url = "https://ygp.gdzwfw.gov.cn/#/44/new/jygg/v3/A"

    # Map API fields to URL parameters.
    # Note: 'nodeId' is omitted because it is not present in the list response;
    # we assume the detail page tolerates a missing nodeId for direct access.
    params = {
        "noticeId": item.get("noticeId", ""),
        "projectCode": item.get("projectCode", ""),
        "bizCode": item.get("tradingProcess", ""),
        "siteCode": item.get("siteCode", ""),
        "publishDate": item.get("publishDate", ""),
        "source": item.get("pubServicePlat", ""),
        "titleDetails": item.get("noticeSecondTypeDesc", ""),
        "classify": item.get("projectType", ""),
    }
    query = urllib.parse.urlencode(params)
    return f"{base_url}?{query}"


async def run():
    args = parse_args()

    # Determine the date range; default to today on both ends.
    today = date.today()
    start_date = today
    end_date = today

    if args.start_date:
        try:
            start_date = datetime.strptime(args.start_date, "%Y-%m-%d").date()
        except ValueError:
            print(f"Error: Invalid start date format: {args.start_date}", file=sys.stderr)
            sys.exit(1)

    if args.end_date:
        try:
            end_date = datetime.strptime(args.end_date, "%Y-%m-%d").date()
        except ValueError:
            print(f"Error: Invalid end date format: {args.end_date}", file=sys.stderr)
            sys.exit(1)

    if start_date > end_date:
        print(f"Error: Start date {start_date} is after end date {end_date}", file=sys.stderr)
        sys.exit(1)

    print(f"Crawling range: {start_date} to {end_date}", file=sys.stderr)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent=(
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            ),
            viewport={"width": 1280, "height": 800},
        )
        page = await context.new_page()

        def process_items(items):
            """Filter one page of API items by date range and keyword."""
            page_results = []
            stop_signal = False
            min_date_on_page = None

            for item in items:
                title = item.get("noticeTitle", "")
                pub_date_str = item.get("publishDate", "")
                item_date = parse_api_date(pub_date_str)

                if item_date:
                    if min_date_on_page is None or item_date < min_date_on_page:
                        min_date_on_page = item_date

                    # Date filter
                    if item_date > end_date:
                        continue  # Too new, skip
                    if item_date < start_date:
                        # Too old. The list is normally reverse chronological, but
                        # pinned items may break strict ordering, so skip this item
                        # and let the page-level check below decide when to stop.
                        continue

                # Keyword filter: keep only award-result notices ("中标结果").
                if "中标结果" in title:
                    page_results.append({
                        "项目标题": title,  # project title
                        "发布时间": pub_date_str,  # publish time
                        "详细链接": construct_detail_url(item),  # detail URL
                    })

            # Stop condition: if the oldest item on this page predates start_date,
            # later pages should be older still (assuming reverse chronological
            # order), so signal the caller to stop paginating.
            if min_date_on_page and min_date_on_page < start_date:
                stop_signal = True

            return page_results, stop_signal

        try:
            print("Loading page...", file=sys.stderr)

            # Set up a response listener for the initial load.
            async with page.expect_response("**/search/v2/items", timeout=15000) as response_info:
                await page.goto("https://ygp.gdzwfw.gov.cn/#/44/jygg")
            response = await response_info.value
            data = await response.json()
            items = data.get("data", {}).get("pageData", [])

            results, stop = process_items(items)
            for r in results:
                print(json.dumps(r, ensure_ascii=False))

            if stop:
                print("Date range satisfied (initial page). Stopping.", file=sys.stderr)
                await browser.close()
                return

            # Pagination loop
            page_num = 1
            while True:
                page_num += 1
                print(f"Processing page {page_num}...", file=sys.stderr)

                # Locate the pagination 'next' button (usually .btn-next).
                next_btn = page.locator(".btn-next")

                # A disabled 'next' button means we are on the last page.
                if await next_btn.get_attribute("disabled") is not None:
                    print("Reached last page.", file=sys.stderr)
                    break

                # Click and wait for the list API to respond.
                try:
                    async with page.expect_response("**/search/v2/items", timeout=10000) as response_info:
                        await next_btn.click()
                    response = await response_info.value
                    if response.status != 200:
                        print(f"API Error: {response.status}", file=sys.stderr)
                        break

                    data = await response.json()
                    items = data.get("data", {}).get("pageData", [])
                    if not items:
                        print("No more items.", file=sys.stderr)
                        break

                    results, stop = process_items(items)
                    for r in results:
                        print(json.dumps(r, ensure_ascii=False))

                    if stop:
                        print("Date range satisfied. Stopping.", file=sys.stderr)
                        break
                except Exception as e:
                    print(f"Error during pagination: {e}", file=sys.stderr)
                    break

        except Exception as e:
            print(f"Fatal Error: {e}", file=sys.stderr)

        await browser.close()


if __name__ == "__main__":
    asyncio.run(run())