import argparse
import json
import os
import re
import time

import requests

SEMANTIC_API = "https://api.semanticscholar.org/graph/v1/paper/DOI:{}?fields=title,abstract"


def load_json(path):
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def find_hits(data):
    # Try several common paths to locate the list of records
    if isinstance(data, dict):
        # result.hits.hit
        r = data.get('result')
        if r and isinstance(r, dict):
            h = r.get('hits')
            if h and isinstance(h, dict):
                hit = h.get('hit')
                if isinstance(hit, list):
                    return hit
        # direct 'hit' or 'hits' keys
        if 'hit' in data and isinstance(data['hit'], list):
            return data['hit']
        if 'hits' in data and isinstance(data['hits'], list):
            return data['hits']
    # fallback: the top level is already a list
    if isinstance(data, list):
        return data
    return []


def extract_doi_from_string(s):
    if not s or not isinstance(s, str):
        return None
    # common DOI pattern
    m = re.search(r'(10\.\d{4,9}/[^\s"<>]+)', s)
    if m:
        return m.group(1).rstrip('.,')
    # fallback: look for doi.org/...
    m2 = re.search(r'doi\.org/(.+)', s)
    if m2:
        return m2.group(1).rstrip('.,')
    return None


def extract_doi(record):
    # record may be a dict with an 'info' sub-dict (DBLP API format)
    info = record.get('info') if isinstance(record, dict) else None
    if info and isinstance(info, dict):
        # direct doi field
        doi = info.get('doi')
        if doi:
            return doi
        # 'ee' sometimes contains a doi.org URL
        ee = info.get('ee')
        if ee:
            if isinstance(ee, list):
                for e in ee:
                    d = extract_doi_from_string(e)
                    if d:
                        return d
            else:
                d = extract_doi_from_string(ee)
                if d:
                    return d
        # scan the remaining string fields in info
        for k, v in info.items():
            if isinstance(v, str):
                d = extract_doi_from_string(v)
                if d:
                    return d
            if isinstance(v, list):
                for item in v:
                    if isinstance(item, str):
                        d = extract_doi_from_string(item)
                        if d:
                            return d
    # record-level url / ee
    if isinstance(record, dict):
        for key in ('url', 'ee'):
            val = record.get(key)
            if val and isinstance(val, str):
                d = extract_doi_from_string(val)
                if d:
                    return d
    return None


def get_metadata_from_semantic(doi, retries=3):
    # doi must be the bare DOI (not the full doi.org URL)
    url = SEMANTIC_API.format(doi)
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, timeout=10)
            if resp.status_code == 200:
                return resp.json()
            if resp.status_code == 429:
                # rate limited: back off, then retry
                time.sleep(1.0 * attempt)
            else:
                # other non-200 responses are not retried
                return None
        except requests.RequestException:
            time.sleep(0.5 * attempt)
    return None


def write_markdown_entry(f, paper):
    title = paper.get('title') or paper.get('paperTitle') or 'No Title'
    f.write(f"## {title}\n\n")
    abstract = paper.get('abstract') or ''
    if abstract:
        f.write("**Abstract**:\n\n")
        f.write(abstract.strip() + "\n\n")
    else:
        f.write("_No abstract available._\n\n")
    f.write("\n---\n\n")


def main():
    parser = argparse.ArgumentParser(
        description='Parse a DBLP-style JSON export, filter records by year, and fetch abstracts for their DOIs')
    parser.add_argument('input_json', help='Input JSON file (DBLP-like)')
    parser.add_argument('output_md', help='Output markdown file')
    parser.add_argument('--years', default='2024,2025',
                        help='Comma-separated years to include (default: 2024,2025)')
    parser.add_argument('--sleep', type=float, default=0.1,
                        help='Sleep seconds between API calls (default: 0.1)')
    args = parser.parse_args()

    years = set()
    for y in args.years.split(','):
        y = y.strip()
        if y:
            try:
                years.add(int(y))
            except ValueError:
                pass

    data = load_json(args.input_json)
    hits = find_hits(data)
    print(f'Found {len(hits)} records in JSON. '
          f'Filtering years: {sorted(years)}')

    os.makedirs(os.path.dirname(args.output_md) or '.', exist_ok=True)

    seen_dois = set()
    with open(args.output_md, 'w', encoding='utf-8') as out:
        count = 0
        for rec in hits:
            info = rec.get('info') if isinstance(rec, dict) else None
            year = None
            if info and isinstance(info, dict):
                y = info.get('year')
                if isinstance(y, str) and y.isdigit():
                    year = int(y)
                elif isinstance(y, int):
                    year = y
            if year not in years:
                continue

            doi = extract_doi(rec)
            if not doi:
                # write a minimal entry using the title when no DOI is found
                title = info.get('title') if info else rec.get(
                    'title', f'No title for id {rec.get("@id") if isinstance(rec, dict) else "?"}')
                out.write(
                    f"## {title}\n\n_No DOI found; skipped Semantic Scholar lookup._\n\n\n---\n\n")
                continue

            # normalize the DOI (strip a doi.org URL prefix if present)
            doi = doi.split('doi.org/')[-1].strip()
            if doi in seen_dois:
                continue
            seen_dois.add(doi)

            count += 1
            print(f'[{count}] Fetching DOI: {doi} (year={year})')
            meta = get_metadata_from_semantic(doi)
            if meta:
                write_markdown_entry(out, meta)
            else:
                # fallback: write the title from info
                title = info.get('title') if info else f'DOI: {doi}'
                out.write(
                    f"## {title}\n\n_No abstract available or API lookup failed for DOI: {doi}._\n\n\n---\n\n")
            time.sleep(args.sleep)

    print(f'Done. Wrote {len(seen_dois)} DOIs to {args.output_md}')


if __name__ == '__main__':
    main()
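
# Example invocation (the script and data file names below are illustrative, not defined above):
#   python fetch_abstracts.py dblp_export.json abstracts.md --years 2024,2025 --sleep 0.5
#
# The input is assumed to follow the DBLP search API shape that find_hits() understands, e.g.:
#   {"result": {"hits": {"hit": [{"info": {"title": "...", "year": "2024", "doi": "10.xxxx/xxxx"}}]}}}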