first commit

This commit is contained in:
2026-01-11 16:48:49 +08:00
commit 360a369d02
5 changed files with 500 additions and 0 deletions

195
.gitignore vendored Normal file
View File

@@ -0,0 +1,195 @@
# custom
papers/
json/
.vscode/
# ---> Python
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc
# ---> VisualStudioCode
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets
# Local History for Visual Studio Code
.history/
# Built Visual Studio Code Extensions
*.vsix

3
README.md Normal file
View File

@@ -0,0 +1,3 @@
# paper-crawler
基于dblp和Semantic Scholar API爬取论文摘要。

201
crawl_json.py Normal file
View File

@@ -0,0 +1,201 @@
import json
import requests
import time
import os
import argparse
import re
SEMANTIC_API = "https://api.semanticscholar.org/graph/v1/paper/DOI:{}?fields=title,abstract"
def load_json(path):
    """Read the UTF-8 encoded JSON file at *path* and return the parsed value."""
    with open(path, mode='r', encoding='utf-8') as src:
        return json.load(src)
def find_hits(data):
    """Locate the list of paper records inside a DBLP-style JSON payload.

    Tries, in order: ``result.hits.hit``, a top-level ``hit`` list, a
    top-level ``hits`` list, and finally the payload itself when it is
    already a list. Returns an empty list when nothing matches.
    """
    if isinstance(data, dict):
        # Canonical DBLP export shape: result -> hits -> hit
        r = data.get('result')
        if r and isinstance(r, dict):
            h = r.get('hits')
            if h and isinstance(h, dict):
                hit = h.get('hit')
                if isinstance(hit, list):
                    return hit
                # Some exports emit a bare dict (not a 1-element list)
                # when there is exactly one hit; normalize it to a list
                # instead of dropping the record.
                if isinstance(hit, dict):
                    return [hit]
        # Flatter shapes: a direct 'hit' or 'hits' list at the top level.
        if 'hit' in data and isinstance(data['hit'], list):
            return data['hit']
        if 'hits' in data and isinstance(data['hits'], list):
            return data['hits']
    # Fallback: the payload itself is already the record list.
    if isinstance(data, list):
        return data
    return []
def extract_doi_from_string(s):
    """Pull a bare DOI (e.g. ``10.1145/...``) out of an arbitrary string.

    Returns None for non-strings, empty strings, or strings with no
    DOI-like pattern. Trailing '.' and ',' are stripped since they are
    usually sentence punctuation rather than part of the DOI.
    """
    if not isinstance(s, str) or not s:
        return None
    # Standard DOI shape: '10.' + 4-9 digit registrant + '/' + suffix.
    match = re.search(r'(10\.\d{4,9}/[^\s"<>]+)', s)
    if match is not None:
        return match.group(1).rstrip('.,')
    # Last resort: take whatever follows a doi.org/ URL prefix.
    match = re.search(r'doi\.org/(.+)', s)
    if match is not None:
        return match.group(1).rstrip('.,')
    return None
def extract_doi(record):
    """Extract a DOI from a DBLP-style record dict, or return None.

    Search order (first match wins): ``info.doi``, ``info.ee`` (string or
    list of strings), every other string / list-of-strings field inside
    ``info``, then the record-level 'url' and 'ee' fields.
    """
    info = record.get('info') if isinstance(record, dict) else None
    if info and isinstance(info, dict):
        # An explicit DOI field wins outright.
        direct = info.get('doi')
        if direct:
            return direct
        # 'ee' (electronic edition) is often a doi.org URL.
        ee = info.get('ee')
        if ee:
            for candidate in (ee if isinstance(ee, list) else [ee]):
                found = extract_doi_from_string(candidate)
                if found:
                    return found
        # Otherwise scan every remaining info field for a DOI pattern.
        for value in info.values():
            if isinstance(value, str):
                found = extract_doi_from_string(value)
                if found:
                    return found
            if isinstance(value, list):
                for element in value:
                    if isinstance(element, str):
                        found = extract_doi_from_string(element)
                        if found:
                            return found
    # Fall back to record-level URL-like fields.
    if isinstance(record, dict):
        for key in ('url', 'ee'):
            value = record.get(key)
            if isinstance(value, str) and value:
                found = extract_doi_from_string(value)
                if found:
                    return found
    return None
def get_metadata_from_semantic(doi, retries=3):
    """Fetch title+abstract for a bare DOI from the Semantic Scholar API.

    *doi* must be the bare DOI, not a full doi.org URL. Retries with
    linear back-off on HTTP 429 and on transport errors; any other
    non-200 status aborts immediately. Returns the parsed JSON dict on
    success, otherwise None.
    """
    url = SEMANTIC_API.format(doi)
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                return response.json()
            if response.status_code == 429:
                # Rate limited: back off longer on each attempt.
                time.sleep(1.0 * attempt)
            else:
                # Any other error status is treated as permanent.
                return None
        except requests.RequestException:
            # Transport-level failure: short back-off, then retry.
            time.sleep(0.5 * attempt)
    return None
def write_markdown_entry(f, paper):
    """Append one paper to *f* as a Markdown section: a '## title'
    heading, the abstract (or a placeholder when missing), and a '---'
    separator."""
    heading = paper.get('title') or paper.get('paperTitle') or 'No Title'
    f.write(f"## {heading}\n\n")
    body = paper.get('abstract', '')
    if body:
        f.write("**Abstract**:\n\n" + body.strip() + "\n\n")
    else:
        f.write("_No abstract available._\n\n")
    f.write("\n---\n\n")
def main():
parser = argparse.ArgumentParser(
description='Parse JSON DBLP export, fetch 2024/2025 DOIs and get abstracts')
parser.add_argument('input_json', help='Input JSON file (DBLP-like)')
parser.add_argument('output_md', help='Output markdown file')
parser.add_argument('--years', default='2024,2025',
help='Comma-separated years to include (default: 2024,2025)')
parser.add_argument('--sleep', type=float, default=0.1,
help='Sleep seconds between API calls (default 0.1)')
args = parser.parse_args()
years = set()
for y in args.years.split(','):
y = y.strip()
if y:
try:
years.add(int(y))
except ValueError:
pass
data = load_json(args.input_json)
hits = find_hits(data)
print(f'Found {len(hits)} records in JSON. Filtering years: {sorted(years)}')
os.makedirs(os.path.dirname(args.output_md) if os.path.dirname(
args.output_md) else '.', exist_ok=True)
seen_dois = set()
with open(args.output_md, 'w', encoding='utf-8') as out:
count = 0
for rec in hits:
info = rec.get('info') if isinstance(rec, dict) else None
year = None
if info and isinstance(info, dict):
y = info.get('year')
if isinstance(y, str) and y.isdigit():
year = int(y)
elif isinstance(y, int):
year = y
if year not in years:
continue
doi = extract_doi(rec)
if not doi:
# write a minimal entry using title if no DOI
title = info.get('title') if info else rec.get(
'title', f'No title for id {rec.get("@id") if isinstance(rec, dict) else "?"}')
out.write(
f"## {title}\n\n_No DOI found; skipped Semantic Scholar lookup._\n\n\n---\n\n")
continue
# normalize doi (strip url prefix if present)
doi = doi.split('doi.org/')[-1]
doi = doi.strip()
if doi in seen_dois:
continue
seen_dois.add(doi)
count += 1
print(f'[{count}] Fetching DOI: {doi} (year={year})')
meta = get_metadata_from_semantic(doi)
if meta:
write_markdown_entry(out, meta)
else:
# fallback: write title from info
title = info.get('title') if info else f'DOI: {doi}'
out.write(
f"## {title}\n\n_No abstract available or API lookup failed for DOI: {doi}._\n\n\n---\n\n")
time.sleep(args.sleep)
print(f'Done. Wrote {len(seen_dois)} DOIs to {args.output_md}')
if __name__ == '__main__':
main()

80
crawl_paper.py Normal file
View File

@@ -0,0 +1,80 @@
import requests
from bs4 import BeautifulSoup
import time
import os
import argparse
# Semantic Scholar API URL无需API Key
SEMANTIC_API = "https://api.semanticscholar.org/graph/v1/paper/DOI:{}?fields=title,abstract"
def get_dois_from_dblp(dblp_url):
    """Scrape a DBLP listing page and return its doi.org links.

    Returns the deduplicated list of 'https://doi.org/...' hrefs found
    inside publication entries ('li.entry'), preserving page order.
    Raises requests.HTTPError on a non-success HTTP status.
    """
    # timeout=10 matches get_metadata_from_semantic; without it a stalled
    # connection would hang the crawler indefinitely.
    resp = requests.get(dblp_url, timeout=10)
    # Fail loudly on HTTP errors instead of parsing an error page.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    doi_links = []
    seen = set()
    for article in soup.find_all("li", class_="entry"):
        for link in article.find_all("a", href=True):
            href = link['href']
            # Exclude bsky.app links — presumably social-share URLs that
            # also embed doi.org; original filter kept as-is.
            if "doi.org" in href and "bsky.app" not in href:
                if href.startswith("https://doi.org/"):
                    doi = href.strip()
                    if doi not in seen:
                        seen.add(doi)
                        doi_links.append(doi)
    return doi_links  # keep original page order
def get_metadata_from_semantic(doi):
    """Fetch title+abstract metadata for *doi* from Semantic Scholar.

    *doi* may be a full doi.org URL or a bare DOI. Returns the parsed
    JSON dict on success, None on any failure.
    """
    # Strip a doi.org URL prefix so only the bare DOI enters the API path.
    url = SEMANTIC_API.format(doi.split("doi.org/")[-1])
    try:
        resp = requests.get(url, timeout=10)
        if resp.status_code == 200:
            return resp.json()
    except requests.RequestException as e:
        # Narrowed from bare `except Exception`: only network/HTTP errors
        # are expected here; anything else is a bug and should surface.
        print(f"Error fetching {doi}: {e}")
    return None
def write_markdown_entry(f, paper):
    """Append one paper to *f* as a Markdown section: a '## title'
    heading, the abstract (or a placeholder when missing), and a '---'
    separator."""
    title = paper.get("title", "No Title")
    f.write(f"## {title}\n\n")
    text = paper.get("abstract", "")
    if not text:
        f.write("_No abstract available._\n\n")
    else:
        f.write(f"**Abstract**:\n\n{text.strip()}\n\n")
    f.write("\n---\n\n")
def main():
    """CLI entry point: scrape DOI links from a DBLP page, fetch each
    paper's abstract from Semantic Scholar, and write a Markdown digest."""
    parser = argparse.ArgumentParser(
        description='Crawl paper abstracts from dblp and Semantic Scholar')
    parser.add_argument('dblp_url', help='DBLP conference URL')
    parser.add_argument('output_file', help='Output markdown file path')
    args = parser.parse_args()
    # Make sure the output directory exists (empty dirname means the cwd).
    out_dir = os.path.dirname(args.output_file)
    os.makedirs(out_dir if out_dir else '.', exist_ok=True)
    dois = get_dois_from_dblp(args.dblp_url)
    print(f"Found {len(dois)} DOIs.")
    total = len(dois)
    with open(args.output_file, "w", encoding="utf-8") as f:
        for index, doi in enumerate(dois, 1):
            print(f"[{index}/{total}] Fetching metadata for {doi}")
            metadata = get_metadata_from_semantic(doi)
            if metadata:
                write_markdown_entry(f, metadata)
            else:
                # Lookup failed: still emit a stub entry carrying the DOI.
                write_markdown_entry(f, {"title": f"DOI: {doi}"})
            # Throttle requests so we are not blocked for hammering the API.
            time.sleep(0.1)
    print(f"\nDone. Saved to: {args.output_file}")
if __name__ == "__main__":
    main()

21
rss.py Normal file
View File

@@ -0,0 +1,21 @@
"""Minimal RSS probe: fetch a CNKI journal feed and print its entries."""
import feedparser
# Example RSS source (a CNKI journal feed).
rss_url = "https://rss.cnki.net/knavi/rss/RJXB?pcode=CJFD,CCJD"
# Parse the RSS feed (performs the network fetch).
feed = feedparser.parse(rss_url)
# feedparser sets `bozo` when it encountered a problem while parsing.
if feed.bozo:
    print("警告RSS 解析可能有问题")
# Print the feed title (falls back to a placeholder when absent).
print("Feed 标题:", feed.feed.get('title', '无标题'))
# Print every entry. NOTE(review): the original comment said "first 5
# entries", but the loop actually iterates over ALL entries.
for entry in feed.entries:
    print("标题:", entry.title)
    print("链接:", entry.link)
    print("发布时间:", entry.get('published', '无时间'))
    print("-" * 40)