81 lines
2.5 KiB
Python
81 lines
2.5 KiB
Python
import requests
|
||
from bs4 import BeautifulSoup
|
||
import time
|
||
import os
|
||
import argparse
|
||
|
||
# Semantic Scholar Graph API paper-lookup URL (no API key required).
# The "{}" placeholder takes the bare DOI; only title and abstract are requested.
SEMANTIC_API = (
    "https://api.semanticscholar.org/graph/v1/paper/"
    "DOI:{}?fields=title,abstract"
)
|
||
|
||
|
||
def get_dois_from_dblp(dblp_url):
    """Collect the DOI links listed on a DBLP page, in page order.

    Downloads ``dblp_url``, walks every ``<li class="entry">`` element and
    gathers the ``href`` of each anchor that starts with
    ``https://doi.org/``.

    Args:
        dblp_url: URL of the DBLP page to scan.

    Returns:
        A list of unique ``https://doi.org/...`` URLs, ordered by their
        first appearance on the page.

    Raises:
        requests.RequestException: If the page cannot be downloaded, the
            request times out, or the server answers with an HTTP error
            status.
    """
    # A timeout keeps the crawler from hanging forever on a stalled
    # connection.
    resp = requests.get(dblp_url, timeout=10)
    # Fail loudly instead of silently parsing an error page into zero DOIs.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    doi_links = []
    seen = set()

    for article in soup.find_all("li", class_="entry"):
        for link in article.find_all("a", href=True):
            # Strip before testing the prefix so stray whitespace around the
            # attribute value does not hide an otherwise valid DOI link.
            doi = link["href"].strip()
            if not doi.startswith("https://doi.org/"):
                continue
            # Links mentioning bsky.app are skipped, as in the original
            # filter (presumably share links -- confirm against DBLP markup).
            if "bsky.app" in doi:
                continue
            if doi not in seen:
                seen.add(doi)
                doi_links.append(doi)

    return doi_links  # first-seen order is preserved
|
||
|
||
|
||
def get_metadata_from_semantic(doi):
    """Look up a paper on Semantic Scholar by DOI.

    Args:
        doi: A DOI string, either bare or as a ``...doi.org/<doi>`` URL;
            everything after the last ``doi.org/`` is sent to the API.

    Returns:
        The decoded JSON body of the response when the API answers with
        HTTP 200, otherwise ``None``. Failures are reported on stdout and
        never raised, so one bad DOI does not abort a whole crawl.
    """
    url = SEMANTIC_API.format(doi.split("doi.org/")[-1])
    try:
        resp = requests.get(url, timeout=10)
        if resp.status_code == 200:
            return resp.json()
        # Report non-200 answers (e.g. 404 unknown DOI, 429 rate limit)
        # instead of dropping them without a trace.
        print(f"Error fetching {doi}: HTTP {resp.status_code}")
    except (requests.RequestException, ValueError) as e:
        # RequestException covers network/timeout errors; ValueError covers
        # a 200 response whose body is not valid JSON.
        print(f"Error fetching {doi}: {e}")
    return None
|
||
|
||
|
||
def write_markdown_entry(f, paper):
    """Write one paper as a Markdown section to an open text stream.

    The section consists of a level-2 heading with the title, the abstract
    (or a placeholder line when there is none), and a horizontal rule.

    Args:
        f: Writable text stream (anything with a ``write(str)`` method).
        paper: Mapping that may contain ``"title"`` and ``"abstract"``.
            Missing, ``None`` or empty values are replaced by placeholders.
    """
    # ``or`` (not a .get default) so an explicit null title from the API
    # does not end up rendered as the heading "## None".
    title = paper.get("title") or "No Title"
    f.write(f"## {title}\n\n")

    abstract = paper.get("abstract") or ""
    if abstract:
        f.write(f"**Abstract**:\n\n{abstract.strip()}\n\n")
    else:
        f.write("_No abstract available._\n\n")
    f.write("\n---\n\n")
|
||
|
||
|
||
def main():
    """Command-line entry point.

    Parses the DBLP URL and output path from the command line, collects the
    DOIs from the DBLP page, fetches each paper's metadata from Semantic
    Scholar and writes one Markdown section per DOI to the output file.
    """
    cli = argparse.ArgumentParser(
        description='Crawl paper abstracts from dblp and Semantic Scholar')
    cli.add_argument('dblp_url', help='DBLP conference URL')
    cli.add_argument('output_file', help='Output markdown file path')
    args = cli.parse_args()

    # Make sure the output directory exists; a bare file name maps to ".".
    target_dir = os.path.dirname(args.output_file) or '.'
    os.makedirs(target_dir, exist_ok=True)

    dois = get_dois_from_dblp(args.dblp_url)
    total = len(dois)
    print(f"Found {total} DOIs.")

    with open(args.output_file, "w", encoding="utf-8") as out:
        for position, doi in enumerate(dois, start=1):
            print(f"[{position}/{total}] Fetching metadata for {doi}")
            paper = get_metadata_from_semantic(doi)
            # Fall back to a DOI-only heading when no metadata came back.
            write_markdown_entry(out, paper or {"title": f"DOI: {doi}"})
            time.sleep(1)  # throttle requests to avoid getting blocked

    print(f"\nDone. Saved to: {args.output_file}")
|
||
|
||
|
||
# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|