From 360a369d02c84bc1d6462abfab8d2f41cfc767cf Mon Sep 17 00:00:00 2001 From: along <1015042407@qq.com> Date: Sun, 11 Jan 2026 16:48:49 +0800 Subject: [PATCH] first commit --- .gitignore | 195 +++++++++++++++++++++++++++++++++++++++++++++++ README.md | 3 + crawl_json.py | 201 +++++++++++++++++++++++++++++++++++++++++++++++++ crawl_paper.py | 80 ++++++++++++++++++++ rss.py | 21 ++++++ 5 files changed, 500 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 crawl_json.py create mode 100644 crawl_paper.py create mode 100644 rss.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..91bb9bd --- /dev/null +++ b/.gitignore @@ -0,0 +1,195 @@ +# custom +papers/ +json/ +.vscode/ + +# ---> Python +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +#uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. 
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
# crawl_json.py
"""Parse a DBLP-style JSON export, filter records by publication year,
resolve each record's DOI and fetch title/abstract metadata from the
Semantic Scholar graph API, writing the result to a markdown file."""
import argparse
import json
import os
import re
import time

# Semantic Scholar graph API endpoint (no API key required for light use).
SEMANTIC_API = "https://api.semanticscholar.org/graph/v1/paper/DOI:{}?fields=title,abstract"


def load_json(path):
    """Return the parsed JSON document stored at *path* (UTF-8)."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def find_hits(data):
    """Locate the list of paper records inside a DBLP-like JSON payload.

    Tries the canonical ``result.hits.hit`` path first, then shallower
    ``hit`` / ``hits`` layouts, then a bare top-level list.  Returns []
    when no record list can be found.
    """
    if isinstance(data, dict):
        # canonical DBLP export shape: result.hits.hit -> list of records
        r = data.get('result')
        if r and isinstance(r, dict):
            h = r.get('hits')
            if h and isinstance(h, dict):
                hit = h.get('hit')
                if isinstance(hit, list):
                    return hit
        # direct 'hit' or 'hits' list at the top level
        if 'hit' in data and isinstance(data['hit'], list):
            return data['hit']
        if 'hits' in data and isinstance(data['hits'], list):
            return data['hits']
    # fallback: the document itself is the record list
    if isinstance(data, list):
        return data
    return []


def extract_doi_from_string(s):
    """Return the first DOI found in string *s*, or None.

    Matches a bare ``10.xxxx/...`` DOI pattern first, then falls back to
    anything following ``doi.org/``; trailing punctuation is stripped.
    """
    if not s or not isinstance(s, str):
        return None
    # common DOI pattern: "10." + 4-9 digit registrant + "/" + suffix
    m = re.search(r'(10\.\d{4,9}/[^\s"<>]+)', s)
    if m:
        return m.group(1).rstrip('.,')
    # fallback: take whatever follows a doi.org/ URL prefix
    m2 = re.search(r'doi\.org/(.+)', s)
    if m2:
        return m2.group(1).rstrip('.,')
    return None


def extract_doi(record):
    """Best-effort DOI extraction from a DBLP-like record dict.

    Checks, in order: an explicit ``info.doi`` field, the ``info.ee``
    link(s), any other string field inside ``info``, and finally the
    record-level ``url`` / ``ee`` fields.  Returns None if nothing matches.
    """
    info = record.get('info') if isinstance(record, dict) else None
    if info and isinstance(info, dict):
        # explicit doi field wins
        doi = info.get('doi')
        if doi:
            return doi
        # 'ee' (electronic edition) often carries a doi.org URL
        ee = info.get('ee')
        if ee:
            if isinstance(ee, list):
                for e in ee:
                    d = extract_doi_from_string(e)
                    if d:
                        return d
            else:
                d = extract_doi_from_string(ee)
                if d:
                    return d
        # last resort inside info: scan every string-valued field
        for k, v in info.items():
            if isinstance(v, str):
                d = extract_doi_from_string(v)
                if d:
                    return d
            if isinstance(v, list):
                for item in v:
                    if isinstance(item, str):
                        d = extract_doi_from_string(item)
                        if d:
                            return d
    # record-level url / ee outside 'info'
    if isinstance(record, dict):
        for key in ('url', 'ee'):
            val = record.get(key)
            if val and isinstance(val, str):
                d = extract_doi_from_string(val)
                if d:
                    return d
    return None


def get_metadata_from_semantic(doi, retries=3):
    """Fetch ``{title, abstract}`` for a bare DOI from Semantic Scholar.

    *doi* must be the bare DOI (no ``https://doi.org/`` prefix).  Retries
    with linear back-off on HTTP 429 and transient network errors;
    returns None on any other failure.
    """
    # Imported lazily so the pure parsing helpers above remain usable
    # even when the optional 'requests' dependency is not installed.
    import requests

    url = SEMANTIC_API.format(doi)
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, timeout=10)
            if resp.status_code == 200:
                return resp.json()
            if resp.status_code == 429:
                # rate limited: back off proportionally to the attempt number
                time.sleep(1.0 * attempt)
            else:
                # other non-200 responses are treated as permanent failures
                return None
        except requests.RequestException:
            time.sleep(0.5 * attempt)
    return None


def write_markdown_entry(f, paper):
    """Append one markdown section (title + abstract) for *paper* to *f*."""
    title = paper.get('title') or paper.get('paperTitle') or 'No Title'
    f.write(f"## {title}\n\n")
    abstract = paper.get('abstract', '')
    if abstract:
        f.write("**Abstract**:\n\n")
        f.write(abstract.strip() + "\n\n")
    else:
        f.write("_No abstract available._\n\n")
    f.write("\n---\n\n")


def main():
    """CLI entry point: read the JSON export, filter by year, emit markdown."""
    parser = argparse.ArgumentParser(
        description='Parse JSON DBLP export, fetch 2024/2025 DOIs and get abstracts')
    parser.add_argument('input_json', help='Input JSON file (DBLP-like)')
    parser.add_argument('output_md', help='Output markdown file')
    parser.add_argument('--years', default='2024,2025',
                        help='Comma-separated years to include (default: 2024,2025)')
    parser.add_argument('--sleep', type=float, default=0.1,
                        help='Sleep seconds between API calls (default 0.1)')
    args = parser.parse_args()

    years = set()
    for y in args.years.split(','):
        y = y.strip()
        if y:
            try:
                years.add(int(y))
            except ValueError:
                pass  # non-numeric year tokens are silently ignored

    data = load_json(args.input_json)
    hits = find_hits(data)
    print(f'Found {len(hits)} records in JSON. Filtering years: {sorted(years)}')

    # make sure the output directory exists before opening the file
    os.makedirs(os.path.dirname(args.output_md) or '.', exist_ok=True)

    seen_dois = set()
    with open(args.output_md, 'w', encoding='utf-8') as out:
        count = 0
        for rec in hits:
            info = rec.get('info') if isinstance(rec, dict) else None
            year = None
            if info and isinstance(info, dict):
                y = info.get('year')
                if isinstance(y, str) and y.isdigit():
                    year = int(y)
                elif isinstance(y, int):
                    year = y
            if year not in years:
                continue

            doi = extract_doi(rec)
            if not doi:
                # BUGFIX: the original used info.get('title') only when info
                # was falsy-checked, so a record with info but no 'title' key
                # rendered as "## None"; fall through all title sources instead.
                title = ((info or {}).get('title')
                         or (rec.get('title') if isinstance(rec, dict) else None)
                         or f'No title for id {rec.get("@id") if isinstance(rec, dict) else "?"}')
                out.write(
                    f"## {title}\n\n_No DOI found; skipped Semantic Scholar lookup._\n\n\n---\n\n")
                continue

            # normalize doi (strip any https://doi.org/ prefix)
            doi = doi.split('doi.org/')[-1]
            doi = doi.strip()
            if doi in seen_dois:
                continue
            seen_dois.add(doi)

            count += 1
            print(f'[{count}] Fetching DOI: {doi} (year={year})')
            meta = get_metadata_from_semantic(doi)
            if meta:
                write_markdown_entry(out, meta)
            else:
                # BUGFIX: same None-title hazard as above when info lacks 'title'
                title = (info or {}).get('title') or f'DOI: {doi}'
                out.write(
                    f"## {title}\n\n_No abstract available or API lookup failed for DOI: {doi}._\n\n\n---\n\n")

            time.sleep(args.sleep)

    print(f'Done. Wrote {len(seen_dois)} DOIs to {args.output_md}')


if __name__ == '__main__':
    main()
# crawl_paper.py — crawl a DBLP conference listing for DOI links and fetch
# each paper's title/abstract from Semantic Scholar into a markdown file.
import argparse
import os
import time

# Semantic Scholar graph API endpoint (no API key required).
SEMANTIC_API = "https://api.semanticscholar.org/graph/v1/paper/DOI:{}?fields=title,abstract"


def get_dois_from_dblp(dblp_url):
    """Scrape a DBLP page and return its ``https://doi.org/`` links,
    de-duplicated while preserving page order."""
    # Imported lazily so the markdown helper below stays usable without
    # the third-party crawling dependencies installed.
    import requests
    from bs4 import BeautifulSoup

    # BUGFIX: the original had no timeout (could hang forever) and no
    # status check (would silently scrape an HTTP error page).
    resp = requests.get(dblp_url, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    doi_links = []
    seen = set()

    for article in soup.find_all("li", class_="entry"):
        for link in article.find_all("a", href=True):
            href = link['href']
            # keep only real doi.org links; skip social-media share links
            if "doi.org" in href and "bsky.app" not in href:
                if href.startswith("https://doi.org/"):
                    doi = href.strip()
                    if doi not in seen:
                        seen.add(doi)
                        doi_links.append(doi)

    return doi_links  # original page order preserved


def get_metadata_from_semantic(doi):
    """Fetch ``{title, abstract}`` for *doi* (full doi.org URL or bare DOI).

    Returns the parsed JSON dict on HTTP 200, otherwise None.
    """
    import requests  # lazy: keeps the module importable without requests

    url = SEMANTIC_API.format(doi.split("doi.org/")[-1])
    try:
        resp = requests.get(url, timeout=10)
        if resp.status_code == 200:
            return resp.json()
    except Exception as e:  # broad on purpose: also covers JSON decode errors
        print(f"Error fetching {doi}: {e}")
    return None


def write_markdown_entry(f, paper):
    """Append one markdown section (title + abstract) for *paper* to *f*."""
    f.write(f"## {paper.get('title', 'No Title')}\n\n")
    abstract = paper.get("abstract", "")
    if abstract:
        f.write(f"**Abstract**:\n\n{abstract.strip()}\n\n")
    else:
        # BUGFIX: dropped the pointless f-prefix on a placeholder-free literal
        f.write("_No abstract available._\n\n")
    f.write("\n---\n\n")


def main():
    """CLI entry point: crawl one DBLP URL, write one markdown file."""
    parser = argparse.ArgumentParser(
        description='Crawl paper abstracts from dblp and Semantic Scholar')
    parser.add_argument('dblp_url', help='DBLP conference URL')
    parser.add_argument('output_file', help='Output markdown file path')

    args = parser.parse_args()

    # make sure the output directory exists before opening the file
    os.makedirs(os.path.dirname(args.output_file) or '.', exist_ok=True)

    dois = get_dois_from_dblp(args.dblp_url)
    print(f"Found {len(dois)} DOIs.")

    with open(args.output_file, "w", encoding="utf-8") as f:
        for i, doi in enumerate(dois, 1):
            print(f"[{i}/{len(dois)}] Fetching metadata for {doi}")
            metadata = get_metadata_from_semantic(doi)
            if metadata:
                write_markdown_entry(f, metadata)
            else:
                write_markdown_entry(f, {"title": f"DOI: {doi}"})
            time.sleep(0.1)  # throttle requests to avoid being blocked

    print(f"\nDone. Saved to: {args.output_file}")


if __name__ == "__main__":
    main()


# ---- rss.py -------------------------------------------------------------
# Print the entries of a CNKI RSS feed.
# BUGFIX: the original ran unconditionally at module top level, performing
# network I/O on import; the __main__ guard makes importing side-effect free.
if __name__ == "__main__":
    import feedparser  # third-party; imported lazily inside the guard

    # sample RSS feed (CNKI, Journal of Software)
    rss_url = "https://rss.cnki.net/knavi/rss/RJXB?pcode=CJFD,CCJD"

    # parse the RSS feed
    feed = feedparser.parse(rss_url)

    # bozo is set when fetching or parsing the feed hit a problem
    if feed.bozo:
        print("警告:RSS 解析可能有问题")

    # print the feed title
    print("Feed 标题:", feed.feed.get('title', '无标题'))

    # print every entry (the original comment claimed "first 5" but the
    # loop has always iterated all entries; behavior kept, comment fixed)
    for entry in feed.entries:
        print("标题:", entry.title)
        print("链接:", entry.link)
        print("发布时间:", entry.get('published', '无时间'))
        print("-" * 40)