first commit

This commit is contained in:
2026-01-11 16:48:49 +08:00
commit 360a369d02
5 changed files with 500 additions and 0 deletions

195
.gitignore vendored Normal file
View File

@@ -0,0 +1,195 @@
# custom
papers/
json/
.vscode/
# ---> Python
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc
# ---> VisualStudioCode
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets
# Local History for Visual Studio Code
.history/
# Built Visual Studio Code Extensions
*.vsix

3
README.md Normal file
View File

@@ -0,0 +1,3 @@
# paper-crawler
基于dblp和Semantic Scholar API爬取论文摘要。

201
crawl_json.py Normal file
View File

@@ -0,0 +1,201 @@
import json
import requests
import time
import os
import argparse
import re
SEMANTIC_API = "https://api.semanticscholar.org/graph/v1/paper/DOI:{}?fields=title,abstract"
def load_json(path):
    """Read the UTF-8 encoded JSON file at *path* and return the parsed value."""
    with open(path, mode='r', encoding='utf-8') as src:
        return json.load(src)
def find_hits(data):
    """Locate the list of paper records inside a DBLP-style JSON payload.

    Tries, in order: ``result.hits.hit``, a top-level ``hit`` list, a
    top-level ``hits`` list, and finally the payload itself when it is
    already a list. Returns an empty list when nothing matches.
    """
    if isinstance(data, dict):
        # Canonical DBLP export shape: result -> hits -> hit
        r = data.get('result')
        if r and isinstance(r, dict):
            h = r.get('hits')
            if h and isinstance(h, dict):
                hit = h.get('hit')
                if isinstance(hit, list):
                    return hit
                # Some exports emit a bare dict (not a 1-element list)
                # when there is exactly one hit; normalize it to a list
                # instead of dropping the record.
                if isinstance(hit, dict):
                    return [hit]
        # Flatter shapes: a direct 'hit' or 'hits' list at the top level.
        if 'hit' in data and isinstance(data['hit'], list):
            return data['hit']
        if 'hits' in data and isinstance(data['hits'], list):
            return data['hits']
    # Fallback: the payload itself is already the record list.
    if isinstance(data, list):
        return data
    return []
def extract_doi_from_string(s):
    """Pull a bare DOI (e.g. ``10.1145/...``) out of an arbitrary string.

    Returns None for non-strings, empty strings, or strings with no
    DOI-like pattern. Trailing '.' and ',' are stripped since they are
    usually sentence punctuation rather than part of the DOI.
    """
    if not isinstance(s, str) or not s:
        return None
    # Standard DOI shape: '10.' + 4-9 digit registrant + '/' + suffix.
    match = re.search(r'(10\.\d{4,9}/[^\s"<>]+)', s)
    if match is not None:
        return match.group(1).rstrip('.,')
    # Last resort: take whatever follows a doi.org/ URL prefix.
    match = re.search(r'doi\.org/(.+)', s)
    if match is not None:
        return match.group(1).rstrip('.,')
    return None
def extract_doi(record):
    """Extract a DOI from a DBLP-style record dict, or return None.

    Search order (first match wins): ``info.doi``, ``info.ee`` (string or
    list of strings), every other string / list-of-strings field inside
    ``info``, then the record-level 'url' and 'ee' fields.
    """
    info = record.get('info') if isinstance(record, dict) else None
    if info and isinstance(info, dict):
        # An explicit DOI field wins outright.
        direct = info.get('doi')
        if direct:
            return direct
        # 'ee' (electronic edition) is often a doi.org URL.
        ee = info.get('ee')
        if ee:
            for candidate in (ee if isinstance(ee, list) else [ee]):
                found = extract_doi_from_string(candidate)
                if found:
                    return found
        # Otherwise scan every remaining info field for a DOI pattern.
        for value in info.values():
            if isinstance(value, str):
                found = extract_doi_from_string(value)
                if found:
                    return found
            if isinstance(value, list):
                for element in value:
                    if isinstance(element, str):
                        found = extract_doi_from_string(element)
                        if found:
                            return found
    # Fall back to record-level URL-like fields.
    if isinstance(record, dict):
        for key in ('url', 'ee'):
            value = record.get(key)
            if isinstance(value, str) and value:
                found = extract_doi_from_string(value)
                if found:
                    return found
    return None
def get_metadata_from_semantic(doi, retries=3):
    """Fetch title+abstract for a bare DOI from the Semantic Scholar API.

    *doi* must be the bare DOI, not a full doi.org URL. Retries with
    linear back-off on HTTP 429 and on transport errors; any other
    non-200 status aborts immediately. Returns the parsed JSON dict on
    success, otherwise None.
    """
    url = SEMANTIC_API.format(doi)
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                return response.json()
            if response.status_code == 429:
                # Rate limited: back off longer on each attempt.
                time.sleep(1.0 * attempt)
            else:
                # Any other error status is treated as permanent.
                return None
        except requests.RequestException:
            # Transport-level failure: short back-off, then retry.
            time.sleep(0.5 * attempt)
    return None
def write_markdown_entry(f, paper):
    """Append one paper to *f* as a Markdown section: a '## title'
    heading, the abstract (or a placeholder when missing), and a '---'
    separator."""
    heading = paper.get('title') or paper.get('paperTitle') or 'No Title'
    f.write(f"## {heading}\n\n")
    body = paper.get('abstract', '')
    if body:
        f.write("**Abstract**:\n\n" + body.strip() + "\n\n")
    else:
        f.write("_No abstract available._\n\n")
    f.write("\n---\n\n")
def main():
parser = argparse.ArgumentParser(
description='Parse JSON DBLP export, fetch 2024/2025 DOIs and get abstracts')
parser.add_argument('input_json', help='Input JSON file (DBLP-like)')
parser.add_argument('output_md', help='Output markdown file')
parser.add_argument('--years', default='2024,2025',
help='Comma-separated years to include (default: 2024,2025)')
parser.add_argument('--sleep', type=float, default=0.1,
help='Sleep seconds between API calls (default 0.1)')
args = parser.parse_args()
years = set()
for y in args.years.split(','):
y = y.strip()
if y:
try:
years.add(int(y))
except ValueError:
pass
data = load_json(args.input_json)
hits = find_hits(data)
print(f'Found {len(hits)} records in JSON. Filtering years: {sorted(years)}')
os.makedirs(os.path.dirname(args.output_md) if os.path.dirname(
args.output_md) else '.', exist_ok=True)
seen_dois = set()
with open(args.output_md, 'w', encoding='utf-8') as out:
count = 0
for rec in hits:
info = rec.get('info') if isinstance(rec, dict) else None
year = None
if info and isinstance(info, dict):
y = info.get('year')
if isinstance(y, str) and y.isdigit():
year = int(y)
elif isinstance(y, int):
year = y
if year not in years:
continue
doi = extract_doi(rec)
if not doi:
# write a minimal entry using title if no DOI
title = info.get('title') if info else rec.get(
'title', f'No title for id {rec.get("@id") if isinstance(rec, dict) else "?"}')
out.write(
f"## {title}\n\n_No DOI found; skipped Semantic Scholar lookup._\n\n\n---\n\n")
continue
# normalize doi (strip url prefix if present)
doi = doi.split('doi.org/')[-1]
doi = doi.strip()
if doi in seen_dois:
continue
seen_dois.add(doi)
count += 1
print(f'[{count}] Fetching DOI: {doi} (year={year})')
meta = get_metadata_from_semantic(doi)
if meta:
write_markdown_entry(out, meta)
else:
# fallback: write title from info
title = info.get('title') if info else f'DOI: {doi}'
out.write(
f"## {title}\n\n_No abstract available or API lookup failed for DOI: {doi}._\n\n\n---\n\n")
time.sleep(args.sleep)
print(f'Done. Wrote {len(seen_dois)} DOIs to {args.output_md}')
if __name__ == '__main__':
main()

80
crawl_paper.py Normal file
View File

@@ -0,0 +1,80 @@
import requests
from bs4 import BeautifulSoup
import time
import os
import argparse
# Semantic Scholar API URL无需API Key
SEMANTIC_API = "https://api.semanticscholar.org/graph/v1/paper/DOI:{}?fields=title,abstract"
def get_dois_from_dblp(dblp_url):
    """Scrape a DBLP listing page and return its doi.org links.

    Returns the deduplicated list of 'https://doi.org/...' hrefs found
    inside publication entries ('li.entry'), preserving page order.
    Raises requests.HTTPError on a non-success HTTP status.
    """
    # timeout=10 matches get_metadata_from_semantic; without it a stalled
    # connection would hang the crawler indefinitely.
    resp = requests.get(dblp_url, timeout=10)
    # Fail loudly on HTTP errors instead of parsing an error page.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    doi_links = []
    seen = set()
    for article in soup.find_all("li", class_="entry"):
        for link in article.find_all("a", href=True):
            href = link['href']
            # Exclude bsky.app links — presumably social-share URLs that
            # also embed doi.org; original filter kept as-is.
            if "doi.org" in href and "bsky.app" not in href:
                if href.startswith("https://doi.org/"):
                    doi = href.strip()
                    if doi not in seen:
                        seen.add(doi)
                        doi_links.append(doi)
    return doi_links  # keep original page order
def get_metadata_from_semantic(doi):
    """Fetch title+abstract metadata for *doi* from Semantic Scholar.

    *doi* may be a full doi.org URL or a bare DOI. Returns the parsed
    JSON dict on success, None on any failure.
    """
    # Strip a doi.org URL prefix so only the bare DOI enters the API path.
    url = SEMANTIC_API.format(doi.split("doi.org/")[-1])
    try:
        resp = requests.get(url, timeout=10)
        if resp.status_code == 200:
            return resp.json()
    except requests.RequestException as e:
        # Narrowed from bare `except Exception`: only network/HTTP errors
        # are expected here; anything else is a bug and should surface.
        print(f"Error fetching {doi}: {e}")
    return None
def write_markdown_entry(f, paper):
    """Append one paper to *f* as a Markdown section: a '## title'
    heading, the abstract (or a placeholder when missing), and a '---'
    separator."""
    title = paper.get("title", "No Title")
    f.write(f"## {title}\n\n")
    text = paper.get("abstract", "")
    if not text:
        f.write("_No abstract available._\n\n")
    else:
        f.write(f"**Abstract**:\n\n{text.strip()}\n\n")
    f.write("\n---\n\n")
def main():
    """CLI entry point: scrape DOI links from a DBLP page, fetch each
    paper's abstract from Semantic Scholar, and write a Markdown digest."""
    parser = argparse.ArgumentParser(
        description='Crawl paper abstracts from dblp and Semantic Scholar')
    parser.add_argument('dblp_url', help='DBLP conference URL')
    parser.add_argument('output_file', help='Output markdown file path')
    args = parser.parse_args()
    # Make sure the output directory exists (empty dirname means the cwd).
    out_dir = os.path.dirname(args.output_file)
    os.makedirs(out_dir if out_dir else '.', exist_ok=True)
    dois = get_dois_from_dblp(args.dblp_url)
    print(f"Found {len(dois)} DOIs.")
    total = len(dois)
    with open(args.output_file, "w", encoding="utf-8") as f:
        for index, doi in enumerate(dois, 1):
            print(f"[{index}/{total}] Fetching metadata for {doi}")
            metadata = get_metadata_from_semantic(doi)
            if metadata:
                write_markdown_entry(f, metadata)
            else:
                # Lookup failed: still emit a stub entry carrying the DOI.
                write_markdown_entry(f, {"title": f"DOI: {doi}"})
            # Throttle requests so we are not blocked for hammering the API.
            time.sleep(0.1)
    print(f"\nDone. Saved to: {args.output_file}")
if __name__ == "__main__":
    main()

21
rss.py Normal file
View File

@@ -0,0 +1,21 @@
"""Minimal RSS probe: fetch a CNKI journal feed and print its entries."""
import feedparser
# Example RSS source (a CNKI journal feed).
rss_url = "https://rss.cnki.net/knavi/rss/RJXB?pcode=CJFD,CCJD"
# Parse the RSS feed (performs the network fetch).
feed = feedparser.parse(rss_url)
# feedparser sets `bozo` when it encountered a problem while parsing.
if feed.bozo:
    print("警告RSS 解析可能有问题")
# Print the feed title (falls back to a placeholder when absent).
print("Feed 标题:", feed.feed.get('title', '无标题'))
# Print every entry. NOTE(review): the original comment said "first 5
# entries", but the loop actually iterates over ALL entries.
for entry in feed.entries:
    print("标题:", entry.title)
    print("链接:", entry.link)
    print("发布时间:", entry.get('published', '无时间'))
    print("-" * 40)