From 360a369d02c84bc1d6462abfab8d2f41cfc767cf Mon Sep 17 00:00:00 2001 From: along <1015042407@qq.com> Date: Sun, 11 Jan 2026 16:48:49 +0800 Subject: [PATCH] first commit --- .gitignore | 195 +++++++++++++++++++++++++++++++++++++++++++++++ README.md | 3 + crawl_json.py | 201 +++++++++++++++++++++++++++++++++++++++++++++++++ crawl_paper.py | 80 ++++++++++++++++++++ rss.py | 21 ++++++ 5 files changed, 500 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 crawl_json.py create mode 100644 crawl_paper.py create mode 100644 rss.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..91bb9bd --- /dev/null +++ b/.gitignore @@ -0,0 +1,195 @@ +# custom +papers/ +json/ +.vscode/ + +# ---> Python +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +#uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. 
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
# crawl_json.py
"""Parse a DBLP-style JSON export, filter records by publication year,
resolve each record's DOI and fetch title/abstract metadata from the
Semantic Scholar graph API, writing the result to a markdown file."""
import argparse
import json
import os
import re
import time

# Semantic Scholar graph API endpoint (no API key required for light use).
SEMANTIC_API = "https://api.semanticscholar.org/graph/v1/paper/DOI:{}?fields=title,abstract"


def load_json(path):
    """Return the parsed JSON document stored at *path* (UTF-8)."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def find_hits(data):
    """Locate the list of paper records inside a DBLP-like JSON payload.

    Tries the canonical ``result.hits.hit`` path first, then shallower
    ``hit`` / ``hits`` layouts, then a bare top-level list.  Returns []
    when no record list can be found.
    """
    if isinstance(data, dict):
        # canonical DBLP export shape: result.hits.hit -> list of records
        r = data.get('result')
        if r and isinstance(r, dict):
            h = r.get('hits')
            if h and isinstance(h, dict):
                hit = h.get('hit')
                if isinstance(hit, list):
                    return hit
        # direct 'hit' or 'hits' list at the top level
        if 'hit' in data and isinstance(data['hit'], list):
            return data['hit']
        if 'hits' in data and isinstance(data['hits'], list):
            return data['hits']
    # fallback: the document itself is the record list
    if isinstance(data, list):
        return data
    return []


def extract_doi_from_string(s):
    """Return the first DOI found in string *s*, or None.

    Matches a bare ``10.xxxx/...`` DOI pattern first, then falls back to
    anything following ``doi.org/``; trailing punctuation is stripped.
    """
    if not s or not isinstance(s, str):
        return None
    # common DOI pattern: "10." + 4-9 digit registrant + "/" + suffix
    m = re.search(r'(10\.\d{4,9}/[^\s"<>]+)', s)
    if m:
        return m.group(1).rstrip('.,')
    # fallback: take whatever follows a doi.org/ URL prefix
    m2 = re.search(r'doi\.org/(.+)', s)
    if m2:
        return m2.group(1).rstrip('.,')
    return None


def extract_doi(record):
    """Best-effort DOI extraction from a DBLP-like record dict.

    Checks, in order: an explicit ``info.doi`` field, the ``info.ee``
    link(s), any other string field inside ``info``, and finally the
    record-level ``url`` / ``ee`` fields.  Returns None if nothing matches.
    """
    info = record.get('info') if isinstance(record, dict) else None
    if info and isinstance(info, dict):
        # explicit doi field wins
        doi = info.get('doi')
        if doi:
            return doi
        # 'ee' (electronic edition) often carries a doi.org URL
        ee = info.get('ee')
        if ee:
            if isinstance(ee, list):
                for e in ee:
                    d = extract_doi_from_string(e)
                    if d:
                        return d
            else:
                d = extract_doi_from_string(ee)
                if d:
                    return d
        # last resort inside info: scan every string-valued field
        for k, v in info.items():
            if isinstance(v, str):
                d = extract_doi_from_string(v)
                if d:
                    return d
            if isinstance(v, list):
                for item in v:
                    if isinstance(item, str):
                        d = extract_doi_from_string(item)
                        if d:
                            return d
    # record-level url / ee outside 'info'
    if isinstance(record, dict):
        for key in ('url', 'ee'):
            val = record.get(key)
            if val and isinstance(val, str):
                d = extract_doi_from_string(val)
                if d:
                    return d
    return None


def get_metadata_from_semantic(doi, retries=3):
    """Fetch ``{title, abstract}`` for a bare DOI from Semantic Scholar.

    *doi* must be the bare DOI (no ``https://doi.org/`` prefix).  Retries
    with linear back-off on HTTP 429 and transient network errors;
    returns None on any other failure.
    """
    # Imported lazily so the pure parsing helpers above remain usable
    # even when the optional 'requests' dependency is not installed.
    import requests

    url = SEMANTIC_API.format(doi)
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, timeout=10)
            if resp.status_code == 200:
                return resp.json()
            if resp.status_code == 429:
                # rate limited: back off proportionally to the attempt number
                time.sleep(1.0 * attempt)
            else:
                # other non-200 responses are treated as permanent failures
                return None
        except requests.RequestException:
            time.sleep(0.5 * attempt)
    return None


def write_markdown_entry(f, paper):
    """Append one markdown section (title + abstract) for *paper* to *f*."""
    title = paper.get('title') or paper.get('paperTitle') or 'No Title'
    f.write(f"## {title}\n\n")
    abstract = paper.get('abstract', '')
    if abstract:
        f.write("**Abstract**:\n\n")
        f.write(abstract.strip() + "\n\n")
    else:
        f.write("_No abstract available._\n\n")
    f.write("\n---\n\n")


def main():
    """CLI entry point: read the JSON export, filter by year, emit markdown."""
    parser = argparse.ArgumentParser(
        description='Parse JSON DBLP export, fetch 2024/2025 DOIs and get abstracts')
    parser.add_argument('input_json', help='Input JSON file (DBLP-like)')
    parser.add_argument('output_md', help='Output markdown file')
    parser.add_argument('--years', default='2024,2025',
                        help='Comma-separated years to include (default: 2024,2025)')
    parser.add_argument('--sleep', type=float, default=0.1,
                        help='Sleep seconds between API calls (default 0.1)')
    args = parser.parse_args()

    years = set()
    for y in args.years.split(','):
        y = y.strip()
        if y:
            try:
                years.add(int(y))
            except ValueError:
                pass  # non-numeric year tokens are silently ignored

    data = load_json(args.input_json)
    hits = find_hits(data)
    print(f'Found {len(hits)} records in JSON. Filtering years: {sorted(years)}')

    # make sure the output directory exists before opening the file
    os.makedirs(os.path.dirname(args.output_md) or '.', exist_ok=True)

    seen_dois = set()
    with open(args.output_md, 'w', encoding='utf-8') as out:
        count = 0
        for rec in hits:
            info = rec.get('info') if isinstance(rec, dict) else None
            year = None
            if info and isinstance(info, dict):
                y = info.get('year')
                if isinstance(y, str) and y.isdigit():
                    year = int(y)
                elif isinstance(y, int):
                    year = y
            if year not in years:
                continue

            doi = extract_doi(rec)
            if not doi:
                # BUGFIX: the original used info.get('title') only when info
                # was falsy-checked, so a record with info but no 'title' key
                # rendered as "## None"; fall through all title sources instead.
                title = ((info or {}).get('title')
                         or (rec.get('title') if isinstance(rec, dict) else None)
                         or f'No title for id {rec.get("@id") if isinstance(rec, dict) else "?"}')
                out.write(
                    f"## {title}\n\n_No DOI found; skipped Semantic Scholar lookup._\n\n\n---\n\n")
                continue

            # normalize doi (strip any https://doi.org/ prefix)
            doi = doi.split('doi.org/')[-1]
            doi = doi.strip()
            if doi in seen_dois:
                continue
            seen_dois.add(doi)

            count += 1
            print(f'[{count}] Fetching DOI: {doi} (year={year})')
            meta = get_metadata_from_semantic(doi)
            if meta:
                write_markdown_entry(out, meta)
            else:
                # BUGFIX: same None-title hazard as above when info lacks 'title'
                title = (info or {}).get('title') or f'DOI: {doi}'
                out.write(
                    f"## {title}\n\n_No abstract available or API lookup failed for DOI: {doi}._\n\n\n---\n\n")

            time.sleep(args.sleep)

    print(f'Done. Wrote {len(seen_dois)} DOIs to {args.output_md}')


if __name__ == '__main__':
    main()
# crawl_paper.py — crawl a DBLP conference listing for DOI links and fetch
# each paper's title/abstract from Semantic Scholar into a markdown file.
import argparse
import os
import time

# Semantic Scholar graph API endpoint (no API key required).
SEMANTIC_API = "https://api.semanticscholar.org/graph/v1/paper/DOI:{}?fields=title,abstract"


def get_dois_from_dblp(dblp_url):
    """Scrape a DBLP page and return its ``https://doi.org/`` links,
    de-duplicated while preserving page order."""
    # Imported lazily so the markdown helper below stays usable without
    # the third-party crawling dependencies installed.
    import requests
    from bs4 import BeautifulSoup

    # BUGFIX: the original had no timeout (could hang forever) and no
    # status check (would silently scrape an HTTP error page).
    resp = requests.get(dblp_url, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    doi_links = []
    seen = set()

    for article in soup.find_all("li", class_="entry"):
        for link in article.find_all("a", href=True):
            href = link['href']
            # keep only real doi.org links; skip social-media share links
            if "doi.org" in href and "bsky.app" not in href:
                if href.startswith("https://doi.org/"):
                    doi = href.strip()
                    if doi not in seen:
                        seen.add(doi)
                        doi_links.append(doi)

    return doi_links  # original page order preserved


def get_metadata_from_semantic(doi):
    """Fetch ``{title, abstract}`` for *doi* (full doi.org URL or bare DOI).

    Returns the parsed JSON dict on HTTP 200, otherwise None.
    """
    import requests  # lazy: keeps the module importable without requests

    url = SEMANTIC_API.format(doi.split("doi.org/")[-1])
    try:
        resp = requests.get(url, timeout=10)
        if resp.status_code == 200:
            return resp.json()
    except Exception as e:  # broad on purpose: also covers JSON decode errors
        print(f"Error fetching {doi}: {e}")
    return None


def write_markdown_entry(f, paper):
    """Append one markdown section (title + abstract) for *paper* to *f*."""
    f.write(f"## {paper.get('title', 'No Title')}\n\n")
    abstract = paper.get("abstract", "")
    if abstract:
        f.write(f"**Abstract**:\n\n{abstract.strip()}\n\n")
    else:
        # BUGFIX: dropped the pointless f-prefix on a placeholder-free literal
        f.write("_No abstract available._\n\n")
    f.write("\n---\n\n")


def main():
    """CLI entry point: crawl one DBLP URL, write one markdown file."""
    parser = argparse.ArgumentParser(
        description='Crawl paper abstracts from dblp and Semantic Scholar')
    parser.add_argument('dblp_url', help='DBLP conference URL')
    parser.add_argument('output_file', help='Output markdown file path')

    args = parser.parse_args()

    # make sure the output directory exists before opening the file
    os.makedirs(os.path.dirname(args.output_file) or '.', exist_ok=True)

    dois = get_dois_from_dblp(args.dblp_url)
    print(f"Found {len(dois)} DOIs.")

    with open(args.output_file, "w", encoding="utf-8") as f:
        for i, doi in enumerate(dois, 1):
            print(f"[{i}/{len(dois)}] Fetching metadata for {doi}")
            metadata = get_metadata_from_semantic(doi)
            if metadata:
                write_markdown_entry(f, metadata)
            else:
                write_markdown_entry(f, {"title": f"DOI: {doi}"})
            time.sleep(0.1)  # throttle requests to avoid being blocked

    print(f"\nDone. Saved to: {args.output_file}")


if __name__ == "__main__":
    main()


# ---- rss.py -------------------------------------------------------------
# Print the entries of a CNKI RSS feed.
# BUGFIX: the original ran unconditionally at module top level, performing
# network I/O on import; the __main__ guard makes importing side-effect free.
if __name__ == "__main__":
    import feedparser  # third-party; imported lazily inside the guard

    # sample RSS feed (CNKI, Journal of Software)
    rss_url = "https://rss.cnki.net/knavi/rss/RJXB?pcode=CJFD,CCJD"

    # parse the RSS feed
    feed = feedparser.parse(rss_url)

    # bozo is set when fetching or parsing the feed hit a problem
    if feed.bozo:
        print("警告:RSS 解析可能有问题")

    # print the feed title
    print("Feed 标题:", feed.feed.get('title', '无标题'))

    # print every entry (the original comment claimed "first 5" but the
    # loop has always iterated all entries; behavior kept, comment fixed)
    for entry in feed.entries:
        print("标题:", entry.title)
        print("链接:", entry.link)
        print("发布时间:", entry.get('published', '无时间'))
        print("-" * 40)