paper-crawler/crawl_paper.py
import requests
from bs4 import BeautifulSoup
import time
import os
import argparse
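
# Example invocation (a sketch; the DBLP URL below is illustrative, not taken from this repository):
#
#   python crawl_paper.py "https://dblp.org/db/conf/icse/icse2024.html" output/icse2024.md
#
# Each DOI found on the DBLP page is resolved through the Semantic Scholar API and
# written as one "## <title>" Markdown section to the output file.
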
# Semantic Scholar Graph API endpoint; no API key is required.
SEMANTIC_API = "https://api.semanticscholar.org/graph/v1/paper/DOI:{}?fields=title,abstract"

def get_dois_from_dblp(dblp_url):
    """Scrape a DBLP conference page and return its DOI links, de-duplicated."""
    resp = requests.get(dblp_url)
    soup = BeautifulSoup(resp.text, "html.parser")
    doi_links = []
    seen = set()
    for article in soup.find_all("li", class_="entry"):
        for link in article.find_all("a", href=True):
            href = link['href']
            if "doi.org" in href and "bsky.app" not in href:
                if href.startswith("https://doi.org/"):
                    doi = href.strip()
                    if doi not in seen:
                        seen.add(doi)
                        doi_links.append(doi)
    return doi_links  # preserve the original page order

def get_metadata_from_semantic(doi):
    """Fetch title and abstract for a DOI from the Semantic Scholar API."""
    url = SEMANTIC_API.format(doi.split("doi.org/")[-1])
    try:
        resp = requests.get(url, timeout=10)
        if resp.status_code == 200:
            return resp.json()
    except Exception as e:
        print(f"Error fetching {doi}: {e}")
    return None

def write_markdown_entry(f, paper):
    """Append one paper as a Markdown section, with its abstract if available."""
    f.write(f"## {paper.get('title', 'No Title')}\n\n")
    abstract = paper.get("abstract", "")
    if abstract:
        f.write(f"**Abstract**:\n\n{abstract.strip()}\n\n")
    else:
        f.write("_No abstract available._\n\n")
    f.write("\n---\n\n")

def main():
    parser = argparse.ArgumentParser(
        description='Crawl paper abstracts from dblp and Semantic Scholar')
    parser.add_argument('dblp_url', help='DBLP conference URL')
    parser.add_argument('output_file', help='Output markdown file path')
    args = parser.parse_args()

    # Ensure the output directory exists.
    os.makedirs(os.path.dirname(args.output_file) if os.path.dirname(
        args.output_file) else '.', exist_ok=True)

    dois = get_dois_from_dblp(args.dblp_url)
    print(f"Found {len(dois)} DOIs.")

    with open(args.output_file, "w", encoding="utf-8") as f:
        for i, doi in enumerate(dois, 1):
            print(f"[{i}/{len(dois)}] Fetching metadata for {doi}")
            metadata = get_metadata_from_semantic(doi)
            if metadata:
                write_markdown_entry(f, metadata)
            else:
                write_markdown_entry(f, {"title": f"DOI: {doi}"})
            time.sleep(1)  # throttle requests to avoid being blocked for hitting the API too fast

    print(f"\nDone. Saved to: {args.output_file}")

if __name__ == "__main__":
    main()