Files
paper-crawler/crawl_paper.py

81 lines
2.5 KiB
Python
Raw Normal View History

2026-01-11 16:48:49 +08:00
import requests
from bs4 import BeautifulSoup
import time
import os
import argparse
# Semantic Scholar Graph API URL template — no API key required.
# The placeholder takes a bare DOI; only title and abstract fields are requested.
SEMANTIC_API = "https://api.semanticscholar.org/graph/v1/paper/DOI:{}?fields=title,abstract"
def get_dois_from_dblp(dblp_url):
    """Scrape a dblp listing page and return its unique DOI links.

    Parameters:
        dblp_url: URL of a dblp conference/journal listing page.

    Returns:
        List of "https://doi.org/..." URLs, deduplicated, in page order.
    """
    # Timeout added: the original call could hang indefinitely on a stalled server.
    resp = requests.get(dblp_url, timeout=30)
    soup = BeautifulSoup(resp.text, "html.parser")
    doi_links = []
    seen = set()
    for article in soup.find_all("li", class_="entry"):
        for link in article.find_all("a", href=True):
            href = link['href']
            # Keep only canonical DOI resolver links. dblp entries also carry
            # share links (e.g. bsky.app) whose URLs embed "doi.org", so those
            # are filtered out explicitly. The original's extra
            # `"doi.org" in href` pre-check was redundant with startswith().
            if href.startswith("https://doi.org/") and "bsky.app" not in href:
                doi = href.strip()
                if doi not in seen:
                    seen.add(doi)
                    doi_links.append(doi)
    return doi_links  # preserve original page order
def get_metadata_from_semantic(doi):
    """Fetch title/abstract metadata for one DOI from Semantic Scholar.

    Parameters:
        doi: full DOI URL (e.g. "https://doi.org/10.1145/...") or a bare DOI
            (the split below leaves a bare DOI unchanged).

    Returns:
        Parsed JSON dict on HTTP 200; None on any other status, on network
        errors, or on an unparseable body (errors are printed, not raised).
    """
    # Strip the resolver prefix so only the bare DOI goes into the template.
    url = SEMANTIC_API.format(doi.split("doi.org/")[-1])
    try:
        resp = requests.get(url, timeout=10)
        if resp.status_code == 200:
            return resp.json()
    except (requests.RequestException, ValueError) as e:
        # Narrowed from bare `except Exception` so programming errors are not
        # silently swallowed; ValueError covers a malformed JSON response body.
        print(f"Error fetching {doi}: {e}")
    return None
def write_markdown_entry(f, paper):
    """Append one paper as a Markdown section to the open text handle *f*.

    Parameters:
        f: writable text file object.
        paper: dict with optional "title" and "abstract" keys. The Semantic
            Scholar API can return explicit null for either field, so both
            are coerced with `or` — `get(key, default)` alone would let a
            present-but-None title print as the literal string "None".
    """
    title = paper.get("title") or "No Title"
    f.write(f"## {title}\n\n")
    abstract = paper.get("abstract") or ""
    if abstract:
        f.write(f"**Abstract**:\n\n{abstract.strip()}\n\n")
    else:
        f.write("_No abstract available._\n\n")
    f.write("\n---\n\n")
def main():
    """CLI entry point: crawl a dblp page and save paper abstracts as Markdown."""
    parser = argparse.ArgumentParser(
        description='Crawl paper abstracts from dblp and Semantic Scholar')
    parser.add_argument('dblp_url', help='DBLP conference URL')
    parser.add_argument('output_file', help='Output markdown file path')
    args = parser.parse_args()

    # Ensure the output directory exists; fall back to '.' when the
    # output path carries no directory component.
    out_dir = os.path.dirname(args.output_file) or '.'
    os.makedirs(out_dir, exist_ok=True)

    dois = get_dois_from_dblp(args.dblp_url)
    print(f"Found {len(dois)} DOIs.")

    total = len(dois)
    with open(args.output_file, "w", encoding="utf-8") as f:
        for idx, doi in enumerate(dois, 1):
            print(f"[{idx}/{total}] Fetching metadata for {doi}")
            metadata = get_metadata_from_semantic(doi)
            # Fall back to a title-only stub when metadata lookup failed.
            entry = metadata if metadata else {"title": f"DOI: {doi}"}
            write_markdown_entry(f, entry)
            time.sleep(0.1)  # throttle requests to avoid being blocked
    print(f"\nDone. Saved to: {args.output_file}")


if __name__ == "__main__":
    main()