202 lines
6.4 KiB
Python
202 lines
6.4 KiB
Python
import json
|
|
import requests
|
|
import time
|
|
import os
|
|
import argparse
|
|
import re
|
|
|
|
SEMANTIC_API = "https://api.semanticscholar.org/graph/v1/paper/DOI:{}?fields=title,abstract"
|
|
|
|
|
|
def load_json(path):
    """Read a UTF-8 encoded JSON file and return the parsed object."""
    with open(path, encoding='utf-8') as fh:
        return json.load(fh)
|
|
|
|
|
|
def find_hits(data):
    """Locate the list of hit records inside a DBLP-style JSON payload.

    Tries several common layouts in order: the nested
    ``result -> hits -> hit`` shape, then top-level ``hit``/``hits``
    lists, then a bare top-level list. Returns an empty list when no
    record list can be found.
    """
    if isinstance(data, dict):
        # Nested layout used by the DBLP search API: result.hits.hit.
        result = data.get('result')
        if isinstance(result, dict) and result:
            hits_obj = result.get('hits')
            if isinstance(hits_obj, dict) and hits_obj:
                records = hits_obj.get('hit')
                if isinstance(records, list):
                    return records
        # Flat layouts: a 'hit' or 'hits' list directly on the mapping.
        for key in ('hit', 'hits'):
            if isinstance(data.get(key), list):
                return data[key]
    # Fallback: the payload itself is already the list of records.
    if isinstance(data, list):
        return data
    return []
|
|
|
|
|
|
def extract_doi_from_string(s):
    """Pull a bare DOI out of an arbitrary string; return None if absent."""
    if not isinstance(s, str) or not s:
        return None
    # Standard DOI form: 10.<registrant>/<suffix>.
    match = re.search(r'(10\.\d{4,9}/[^\s"<>]+)', s)
    if match:
        return match.group(1).rstrip('.,')
    # Fallback: take whatever follows a doi.org/ prefix.
    match = re.search(r'doi\.org/(.+)', s)
    if match:
        return match.group(1).rstrip('.,')
    return None
|
|
|
|
|
|
def extract_doi(record):
    """Best-effort DOI extraction from a DBLP-style record dict.

    Checks, in order: ``info.doi``, ``info.ee`` (string or list of
    strings), every other string / list-of-string field inside ``info``,
    and finally record-level ``url``/``ee`` strings. Returns the first
    DOI found, or None.
    """
    info = record.get('info') if isinstance(record, dict) else None
    if isinstance(info, dict) and info:
        # An explicit DOI field wins outright.
        direct = info.get('doi')
        if direct:
            return direct
        # 'ee' frequently carries a doi.org URL; normalize to a list.
        ee = info.get('ee')
        if ee:
            candidates = ee if isinstance(ee, list) else [ee]
            for candidate in candidates:
                found = extract_doi_from_string(candidate)
                if found:
                    return found
        # Scan every remaining info field for an embedded DOI.
        for value in info.values():
            if isinstance(value, str):
                found = extract_doi_from_string(value)
                if found:
                    return found
            if isinstance(value, list):
                for element in value:
                    if isinstance(element, str):
                        found = extract_doi_from_string(element)
                        if found:
                            return found
    # Last resort: record-level url/ee strings outside 'info'.
    if isinstance(record, dict):
        for key in ('url', 'ee'):
            value = record.get(key)
            if isinstance(value, str) and value:
                found = extract_doi_from_string(value)
                if found:
                    return found
    return None
|
|
|
|
|
|
def get_metadata_from_semantic(doi, retries=3):
    """Fetch title/abstract metadata for a bare DOI from Semantic Scholar.

    `doi` must be the bare identifier (no https://doi.org/ prefix).
    Makes up to `retries` attempts, backing off linearly on HTTP 429
    and on network errors. Returns the parsed JSON dict on success;
    None on any other HTTP status or after all attempts fail.
    """
    endpoint = SEMANTIC_API.format(doi)
    attempt = 0
    while attempt < retries:
        attempt += 1
        try:
            response = requests.get(endpoint, timeout=10)
            if response.status_code == 200:
                return response.json()
        except requests.RequestException:
            # Transient network failure: wait a bit longer each retry.
            time.sleep(0.5 * attempt)
            continue
        if response.status_code != 429:
            # Any non-429 error is treated as permanent — give up now.
            return None
        # 429: rate limited — linear back-off before the next attempt.
        time.sleep(1.0 * attempt)
    return None
|
|
|
|
|
|
def write_markdown_entry(f, paper):
    """Append one paper to *f* as a markdown section (title + abstract)."""
    heading = paper.get('title') or paper.get('paperTitle') or 'No Title'
    f.write(f"## {heading}\n\n")
    abstract = paper.get('abstract', '')
    if not abstract:
        f.write("_No abstract available._\n\n")
    else:
        f.write("**Abstract**:\n\n")
        f.write(abstract.strip() + "\n\n")
    # Horizontal rule separates consecutive entries.
    f.write("\n---\n\n")
|
|
|
|
|
|
def main():
    """CLI entry point.

    Parses a DBLP-style JSON export, filters records by publication
    year, resolves each record's DOI, fetches title/abstract metadata
    from Semantic Scholar, and writes all entries to a markdown file.
    """
    parser = argparse.ArgumentParser(
        description='Parse JSON DBLP export, fetch 2024/2025 DOIs and get abstracts')
    parser.add_argument('input_json', help='Input JSON file (DBLP-like)')
    parser.add_argument('output_md', help='Output markdown file')
    parser.add_argument('--years', default='2024,2025',
                        help='Comma-separated years to include (default: 2024,2025)')
    parser.add_argument('--sleep', type=float, default=0.1,
                        help='Sleep seconds between API calls (default 0.1)')
    args = parser.parse_args()

    years = _parse_years(args.years)

    data = load_json(args.input_json)
    hits = find_hits(data)
    print(f'Found {len(hits)} records in JSON. Filtering years: {sorted(years)}')

    # Ensure the output directory exists ('.' when path has no dir part).
    os.makedirs(os.path.dirname(args.output_md) or '.', exist_ok=True)

    seen_dois = set()
    with open(args.output_md, 'w', encoding='utf-8') as out:
        count = 0
        for rec in hits:
            info = rec.get('info') if isinstance(rec, dict) else None
            year = _record_year(info)
            if year not in years:
                continue

            doi = extract_doi(rec)
            if not doi:
                # No DOI: write a minimal entry using whatever title exists.
                # BUGFIX: the old expression `info.get('title') if info else ...`
                # produced a literal "## None" heading when info existed but
                # lacked a 'title' key; fall back explicitly instead.
                title = ((info.get('title') if isinstance(info, dict) else None)
                         or (rec.get('title') if isinstance(rec, dict) else None)
                         or f'No title for id {rec.get("@id") if isinstance(rec, dict) else "?"}')
                out.write(
                    f"## {title}\n\n_No DOI found; skipped Semantic Scholar lookup._\n\n\n---\n\n")
                continue

            # Normalize: strip a doi.org URL prefix and whitespace, then dedupe.
            doi = doi.split('doi.org/')[-1].strip()
            if doi in seen_dois:
                continue
            seen_dois.add(doi)

            count += 1
            print(f'[{count}] Fetching DOI: {doi} (year={year})')
            meta = get_metadata_from_semantic(doi)
            if meta:
                write_markdown_entry(out, meta)
            else:
                # API lookup failed: still record the paper with a note.
                # Same None-title fallback fix as above.
                title = ((info.get('title') if isinstance(info, dict) else None)
                         or f'DOI: {doi}')
                out.write(
                    f"## {title}\n\n_No abstract available or API lookup failed for DOI: {doi}._\n\n\n---\n\n")

            # Be polite to the API between consecutive calls.
            time.sleep(args.sleep)

    print(f'Done. Wrote {len(seen_dois)} DOIs to {args.output_md}')


def _parse_years(spec):
    """Parse a comma-separated year string into a set of ints.

    Blank and non-numeric tokens are silently skipped, matching the
    tool's original lenient behavior.
    """
    years = set()
    for token in spec.split(','):
        token = token.strip()
        if token:
            try:
                years.add(int(token))
            except ValueError:
                pass  # ignore junk tokens such as '20x4'
    return years


def _record_year(info):
    """Extract an int publication year from a record's info dict, or None.

    Accepts either an int or an all-digit string in info['year'].
    """
    if not isinstance(info, dict):
        return None
    y = info.get('year')
    if isinstance(y, str) and y.isdigit():
        return int(y)
    if isinstance(y, int):
        return y
    return None
|
|
|
|
|
|
# Script entry point: run the CLI only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()
|