"""Crawl paper DOIs from a dblp conference page, fetch title/abstract from the
Semantic Scholar Graph API, and write the results to a markdown file."""

import requests
from bs4 import BeautifulSoup
import time
import os
import argparse

# Semantic Scholar Graph API URL template (no API key required).
# The placeholder receives the bare DOI, e.g. "10.1145/1234567.1234568".
SEMANTIC_API = "https://api.semanticscholar.org/graph/v1/paper/DOI:{}?fields=title,abstract"


def get_dois_from_dblp(dblp_url):
    """Scrape a dblp conference page and return its DOI links.

    Args:
        dblp_url: URL of a dblp table-of-contents page.

    Returns:
        List of unique "https://doi.org/..." URLs, in page order.
    """
    # Timeout added so a stalled dblp server cannot hang the crawler;
    # raise_for_status so an error page doesn't silently yield zero DOIs.
    resp = requests.get(dblp_url, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    doi_links = []
    seen = set()
    for article in soup.find_all("li", class_="entry"):
        for link in article.find_all("a", href=True):
            href = link['href']
            # dblp entries also carry bsky.app share links; skip those.
            if "doi.org" in href and "bsky.app" not in href:
                if href.startswith("https://doi.org/"):
                    doi = href.strip()
                    if doi not in seen:
                        seen.add(doi)
                        doi_links.append(doi)
    return doi_links  # preserves original page order


def get_metadata_from_semantic(doi):
    """Fetch title/abstract metadata for one DOI from Semantic Scholar.

    Args:
        doi: Full "https://doi.org/..." URL (the bare DOI is extracted).

    Returns:
        Parsed JSON dict on HTTP 200, otherwise None (errors are logged,
        not raised, so one bad DOI does not abort the whole run).
    """
    url = SEMANTIC_API.format(doi.split("doi.org/")[-1])
    try:
        resp = requests.get(url, timeout=10)
        if resp.status_code == 200:
            return resp.json()
    except Exception as e:
        print(f"Error fetching {doi}: {e}")
    return None


def write_markdown_entry(f, paper):
    """Append one paper's title and abstract to the open markdown file *f*.

    *paper* is a dict with optional "title" and "abstract" keys; a null
    abstract (Semantic Scholar returns JSON null) falls through to the
    placeholder line because None is falsy.
    """
    f.write(f"## {paper.get('title', 'No Title')}\n\n")
    abstract = paper.get("abstract", "")
    if abstract:
        f.write(f"**Abstract**:\n\n{abstract.strip()}\n\n")
    else:
        # Plain string: no placeholders, so no f-prefix needed.
        f.write("_No abstract available._\n\n")
    f.write("\n---\n\n")


def main():
    """CLI entry point: parse args, crawl DOIs, and write the markdown report."""
    parser = argparse.ArgumentParser(
        description='Crawl paper abstracts from dblp and Semantic Scholar')
    parser.add_argument('dblp_url', help='DBLP conference URL')
    parser.add_argument('output_file', help='Output markdown file path')
    args = parser.parse_args()

    # Ensure the output directory exists (fall back to "." for bare filenames).
    os.makedirs(os.path.dirname(args.output_file) if os.path.dirname(
        args.output_file) else '.', exist_ok=True)

    dois = get_dois_from_dblp(args.dblp_url)
    print(f"Found {len(dois)} DOIs.")

    with open(args.output_file, "w", encoding="utf-8") as f:
        for i, doi in enumerate(dois, 1):
            print(f"[{i}/{len(dois)}] Fetching metadata for {doi}")
            metadata = get_metadata_from_semantic(doi)
            if metadata:
                write_markdown_entry(f, metadata)
            else:
                # Fall back to a stub entry so the DOI is still recorded.
                write_markdown_entry(f, {"title": f"DOI: {doi}"})
            time.sleep(0.1)  # throttle to avoid being rate-limited/banned

    print(f"\nDone. Saved to: {args.output_file}")


if __name__ == "__main__":
    main()