81 lines
2.5 KiB
Python
81 lines
2.5 KiB
Python
|
|
import requests
|
|||
|
|
from bs4 import BeautifulSoup
|
|||
|
|
import time
|
|||
|
|
import os
|
|||
|
|
import argparse
|
|||
|
|
|
|||
|
|
# Semantic Scholar Graph API endpoint (no API key required).
# The {} placeholder is filled with a bare DOI; the response is
# restricted to the title and abstract fields.
SEMANTIC_API = "https://api.semanticscholar.org/graph/v1/paper/DOI:{}?fields=title,abstract"
|
|||
|
|
|
|||
|
|
|
|||
|
|
def get_dois_from_dblp(dblp_url):
    """Scrape all unique DOI links from a dblp listing page.

    Args:
        dblp_url: URL of a dblp conference/venue page whose entries are
            rendered as ``li.entry`` elements containing outbound links.

    Returns:
        List of ``https://doi.org/...`` URLs, de-duplicated while
        preserving the order in which they appear on the page.
    """
    # Timeout added: the original call could hang forever on a stalled
    # server; 10s matches the Semantic Scholar requests in this file.
    resp = requests.get(dblp_url, timeout=10)
    soup = BeautifulSoup(resp.text, "html.parser")

    doi_links = []
    seen = set()
    for article in soup.find_all("li", class_="entry"):
        for link in article.find_all("a", href=True):
            href = link["href"].strip()
            # Keep only canonical DOI resolver links; entries may also
            # carry share links (e.g. bsky.app) that must be skipped.
            # A single startswith check subsumes the previous redundant
            # `"doi.org" in href` pre-test.
            if href.startswith("https://doi.org/") and "bsky.app" not in href:
                if href not in seen:
                    seen.add(href)
                    doi_links.append(href)
    return doi_links  # preserve original page order
|
|||
|
|
|
|||
|
|
|
|||
|
|
def get_metadata_from_semantic(doi):
    """Fetch title/abstract metadata for one DOI from Semantic Scholar.

    Args:
        doi: A DOI, either bare or as a ``https://doi.org/...`` URL.

    Returns:
        The decoded JSON payload (dict) on HTTP 200, otherwise ``None``.
    """
    # Accept both full resolver URLs and bare DOIs: keep whatever
    # follows the last "doi.org/" (the whole string if absent).
    bare_doi = doi.split("doi.org/")[-1]
    request_url = SEMANTIC_API.format(bare_doi)
    try:
        response = requests.get(request_url, timeout=10)
        if response.status_code == 200:
            # .json() stays inside the try so a malformed body is also
            # reported rather than propagated.
            return response.json()
    except Exception as e:
        print(f"Error fetching {doi}: {e}")
    # Non-200 status or any request/decode failure.
    return None
|
|||
|
|
|
|||
|
|
|
|||
|
|
def write_markdown_entry(f, paper):
    """Append one paper as a Markdown section to the open text file *f*.

    Args:
        f: A writable text file object.
        paper: Dict with optional ``title`` and ``abstract`` keys.
    """
    title = paper.get('title', 'No Title')
    f.write(f"## {title}\n\n")

    abstract = paper.get("abstract", "")
    # An empty or None abstract both fall through to the placeholder.
    if abstract:
        section = f"**Abstract**:\n\n{abstract.strip()}\n\n"
    else:
        section = "_No abstract available._\n\n"
    f.write(section)

    # Horizontal rule separating consecutive entries.
    f.write("\n---\n\n")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """CLI entry point: crawl a dblp page and save abstracts as Markdown."""
    parser = argparse.ArgumentParser(
        description='Crawl paper abstracts from dblp and Semantic Scholar')
    parser.add_argument('dblp_url', help='DBLP conference URL')
    parser.add_argument('output_file', help='Output markdown file path')
    args = parser.parse_args()

    # Ensure the output directory exists ('.' when the path has no
    # directory component — os.path.dirname returns '' then).
    out_dir = os.path.dirname(args.output_file) or '.'
    os.makedirs(out_dir, exist_ok=True)

    dois = get_dois_from_dblp(args.dblp_url)
    total = len(dois)
    print(f"Found {total} DOIs.")

    with open(args.output_file, "w", encoding="utf-8") as f:
        for index, doi in enumerate(dois, 1):
            print(f"[{index}/{total}] Fetching metadata for {doi}")
            metadata = get_metadata_from_semantic(doi)
            # Fall back to a title-only stub when the lookup failed.
            entry = metadata if metadata else {"title": f"DOI: {doi}"}
            write_markdown_entry(f, entry)
            time.sleep(0.1)  # throttle requests to avoid getting blocked

    print(f"\nDone. Saved to: {args.output_file}")
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Run the crawler only when executed as a script (not on import).
if __name__ == "__main__":
    main()
|