1 change: 1 addition & 0 deletions .gitignore
@@ -29,6 +29,7 @@ data/result-*/
data/cache/
config.json
.superpowers/
autor-test/.env

# Local-only files (Claude instructions)
CLAUDE.md
3 changes: 3 additions & 0 deletions citationclaw/app/config_manager.py
@@ -140,6 +140,9 @@ class AppConfig(BaseModel):
# Semantic Scholar API Key (raises the rate limit: 1 req/s → 10-100 req/s)
s2_api_key: str = Field(default="", description="Semantic Scholar API Key (optional; greatly improves the PDF download success rate)")

# Web of Science Starter API Key (structured author extraction)
wos_api_key: str = Field(default="", description="Web of Science Starter API Key (used for structured author extraction; takes priority over S2)")

# MinerU Cloud API
mineru_api_token: str = Field(default="", description="MinerU Cloud Precision API Token (optional, used for parsing large files)")

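For orientation, the new field behaves like the existing ones on AppConfig: a plain string defaulting to empty, read defensively by the executor. A minimal sketch, assuming AppConfig can be constructed directly and that its remaining fields all have defaults, as the ones shown here do (key values are placeholders):

from citationclaw.app.config_manager import AppConfig

# Hypothetical keys; real values come from the user's config.json.
cfg = AppConfig(wos_api_key="WOS-XXXX", s2_api_key="S2-XXXX")

# task_executor.py reads the field via getattr, so configs saved
# before this change (without the attribute) still work:
wos_key = getattr(cfg, "wos_api_key", "") or ""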
2 changes: 1 addition & 1 deletion citationclaw/app/main.py
@@ -133,6 +133,7 @@ class ConfigUpdate(BaseModel):
dashboard_skip_citing_analysis: bool = False
dashboard_model: str = "gemini-3-flash-preview-nothinking"
s2_api_key: str = ""
wos_api_key: str = ""
mineru_api_token: str = ""
cdp_debug_port: int = 0
api_access_token: str = ""
@@ -165,7 +166,6 @@ async def get_providers():
async def save_config(config: ConfigUpdate):
try:
data = config.model_dump()
# Debug: log MinerU token save status
token = data.get("mineru_api_token", "")
if token:
print(f"[CONFIG] MinerU token 已保存: {token[:8]}...({len(token)} chars)")
88 changes: 86 additions & 2 deletions citationclaw/app/task_executor.py
@@ -88,6 +88,21 @@ async def _run_new_phase2_and_3(
collector = MetadataCollector(
s2_api_key=getattr(config, 's2_api_key', None),
)

# Structured author extraction: WOS→S2→MinerU (enabled when wos_api_key is set)
_wos_key = getattr(config, 'wos_api_key', '') or ''
_s2_key_for_fetcher = getattr(config, 's2_api_key', '') or ''
from citationclaw.core.structured_author_fetcher import StructuredAuthorFetcher
structured_fetcher: Optional[StructuredAuthorFetcher] = None
if _wos_key:
structured_fetcher = StructuredAuthorFetcher(
wos_api_key=_wos_key,
s2_api_key=_s2_key_for_fetcher,
log_callback=self.log_manager.info,
)
self.log_manager.info(f"📋 WOS 结构化作者提取已启用(WOS + S2 双源融合)")
else:
self.log_manager.info("⚪ 未配置 WOS Key,使用默认 S2+OpenAlex 流程")
self_cite_detector = SelfCitationDetector()
prefilter = ScholarPreFilter()

@@ -173,11 +188,50 @@ async def _fetch_one(idx: int, paper: dict, canonical: str):
metadata = cached
api_hits += 1
else:
# S2-first: search by title, then by URL if title miss
metadata = await collector.collect(title, paper_url=paper_link)
# Structured WOS→S2→OpenAlex extraction (preferred when wos_api_key is set)
wos_authors = []
wos_source = ""
if structured_fetcher:
try:
wos_authors, wos_source = await structured_fetcher.fetch(title)
except Exception:
wos_authors = []

if wos_authors:
# Use the WOS/S2 structured result directly and skip the collector
metadata = {
"title": title,
"doi": "", "s2_id": "", "arxiv_id": "",
"year": paper.get("paper_year"),
"cited_by_count": 0, "influential_citation_count": 0,
"pdf_url": "", "oa_pdf_url": "", "venue": "",
"authors": wos_authors,
"sources": [wos_source],
}
else:
# Fallback: S2-first via collector
metadata = await collector.collect(title, paper_url=paper_link)

if metadata:
await metadata_cache.update(metadata.get("doi", ""), title, metadata)
api_queries += 1

# ── DEBUG: per-paper author detail ──
if metadata:
_src = ",".join(metadata.get("sources", [])) or "?"
_authors = metadata.get("authors", [])
self.log_manager.info(
f" ┌─[{_src}] {title[:52]}"
)
for _a in _authors[:10]:
_n = _a.get("name", "?")
_af = (_a.get("affiliation", "") or "—")[:45]
_afsrc = f" [{_a['affiliation_source']}]" if _a.get("affiliation_source") else ""
self.log_manager.info(f" │ {_n} | {_af}{_afsrc}")
if len(_authors) > 10:
self.log_manager.info(f" │ …共 {len(_authors)} 位")
self.log_manager.info(" └─")

results_slots[idx] = metadata
except Exception as e:
# Don't let one paper's API failure crash the entire batch
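For context, the call sites above pin down the interface this diff assumes for StructuredAuthorFetcher: a constructor taking wos_api_key, s2_api_key, and log_callback, and an async fetch(title) returning an (authors, source) tuple, where a falsy author list triggers the collector fallback. A minimal stub consistent with those call sites; the real implementation lives in citationclaw/core/structured_author_fetcher.py, which this diff does not show:

from typing import Callable, List, Optional, Tuple

class StructuredAuthorFetcherStub:
    """Illustrative stand-in matching the call sites in this diff."""

    def __init__(
        self,
        wos_api_key: str,
        s2_api_key: str = "",
        log_callback: Optional[Callable[[str], None]] = None,
    ):
        self.wos_api_key = wos_api_key
        self.s2_api_key = s2_api_key
        self.log = log_callback or (lambda msg: None)

    async def fetch(self, title: str) -> Tuple[List[dict], str]:
        # The real fetcher queries WOS first, then S2; on a miss it
        # returns ([], "") so the caller falls back to collector.collect().
        self.log(f"structured fetch: {title[:40]}")
        return [], ""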
@@ -267,6 +321,36 @@ async def _fetch_one(idx: int, paper: dict, canonical: str):
f"Phase 2 完成: API找到 {api_found} / GS兜底 {gs_fallback_count} / "
f"缓存 {api_hits} / 共 {len(records_data)} 篇"
)

# ── DEBUG: save per-paper author breakdown to JSON ──
try:
import json as _dbg_json
_debug_records = []
for _i, (_paper, _meta, _canon) in enumerate(records_data):
_debug_records.append({
"idx": _i + 1,
"title": _paper.get("paper_title", ""),
"canonical": _canon,
"source": ",".join((_meta or {}).get("sources", [])) or "gs_fallback",
"authors": [
{
"name": _a.get("name", ""),
"affiliation": _a.get("affiliation", ""),
"affiliation_source": _a.get("affiliation_source", ""),
"s2_id": _a.get("s2_id", ""),
}
for _a in (_meta or {}).get("authors", [])
],
})
_debug_file = result_dir / f"{output_prefix}_author_debug.json"
_debug_file.write_text(
_dbg_json.dumps(_debug_records, ensure_ascii=False, indent=2),
encoding="utf-8",
)
self.log_manager.info(f"📄 作者调试文件: {_debug_file}")
except Exception as _e:
self.log_manager.warning(f"调试文件保存失败: {_e}")

if gs_fallback_count > len(records_data) * 0.5:
self.log_manager.warning(
f"⚠ {gs_fallback_count} 篇论文 API 未找到(S2/OpenAlex 均未收录),"
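The debug file written in the block above holds one record per paper, with exactly the keys assembled in the loop. A sample element (all values are made up for illustration):

# One element of <output_prefix>_author_debug.json:
record = {
    "idx": 1,
    "title": "An Example Paper Title",
    "canonical": "an example paper title",
    "source": "wos",            # or e.g. "s2,openalex", or "gs_fallback"
    "authors": [
        {
            "name": "Jane Doe",
            "affiliation": "Example University",
            "affiliation_source": "wos",
            "s2_id": "12345678",
        },
    ],
}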
7 changes: 4 additions & 3 deletions citationclaw/config/prompts/pdf_author_extract.txt
@@ -1,14 +1,15 @@
Below are text blocks from the first page of an academic paper (parsed from the PDF, in layout order):
Below are text blocks from an academic paper (parsed from the PDF, in layout order; they may span multiple pages):

{first_page_text}

Please extract all authors and their affiliation information from the text.

Requirements:
1. Only extract the authors and affiliations explicitly listed on the paper's first page
2. Take care to distinguish author names from other text (such as the title, abstract, and keywords)
1. If the first page shows only a team name (e.g., "ABC Team", "Research Group XYZ") rather than individual names, look for the full list of individual authors in the subsequent text blocks
2. Only extract authors and affiliations explicitly listed in the paper; do not fabricate any, and do not mix in authors from the references
3. If a corresponding email address is given, extract it as well
4. Keep institution names verbatim as written in the paper (do not translate them)
5. Write names in full "given family" format, not the comma-separated format

Output as a JSON array:
[
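The output block is truncated in this diff, but requirements 1-5 above constrain its shape: one object per author, with the full name, the verbatim institution string, and an email when present. An illustrative element with hypothetical values and field names (the authoritative names are in the untruncated prompt file):

[
    {
        "name": "Jane Doe",
        "affiliation": "Department of CS, Example University",
        "email": "jane.doe@example.edu"
    }
]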
29 changes: 12 additions & 17 deletions citationclaw/core/affiliation_validator.py
@@ -1,14 +1,8 @@
"""Cross-validate author affiliations between API data and PDF-extracted data.

Strategy:
- Match authors by name (fuzzy, handles Chinese/English variants)
- PDF affiliation = publication-time truth (preferred)
- API affiliation = current affiliation (may have changed)
- Merge: PDF > API Author-level > API paper-level > empty
"""
import re
from typing import List, Optional

from citationclaw.core.author_name_utils import format_wos_name, name_keys


class AffiliationValidator:
"""Cross-validate and merge author data from API and PDF sources."""
@@ -26,7 +20,7 @@ def validate(self, api_authors: List[dict], pdf_authors: List[dict]) -> List[dict]:
if not pdf_authors:
return api_authors
if not api_authors:
return [{"name": a["name"], "affiliation": a.get("affiliation", ""),
return [{"name": format_wos_name(a["name"]) or a["name"], "affiliation": a.get("affiliation", ""),
"country": "", "affiliation_source": "pdf"}
for a in pdf_authors]

@@ -79,7 +73,7 @@ def validate(self, api_authors: List[dict], pdf_authors: List[dict]) -> List[dict]:
if not (pdf_keys & matched_pdf_names):
pdf_affil = pdf_a.get("affiliation", "")
merged.append({
"name": pdf_a["name"],
"name": format_wos_name(pdf_a["name"]) or pdf_a["name"],
"affiliation": pdf_affil,
"email": pdf_a.get("email", ""),
"country": self._infer_country(pdf_affil),
@@ -115,7 +109,8 @@ def _infer_country(affiliation: str) -> str:
if any(k in aff for k in cn_kw):
return "CN"
# US institutions
us_kw = ["mit ", "m.i.t", "stanford", "harvard", "berkeley",
us_kw = ["mit", "m.i.t", "massachusetts institute of technology",
"stanford", "harvard", "berkeley",
"carnegie mellon", "cmu", "princeton", "yale",
"columbia university", "cornell", "ucla", "caltech",
"university of california", "university of michigan",
@@ -163,18 +158,18 @@ def _infer_country(affiliation: str) -> str:

@staticmethod
def _name_keys(name: str) -> set:
"""Extract all name variants for matching (same logic as scholar dedup)."""
keys = set()
cleaned = name.strip()
if not cleaned:
return keys
# Split on parentheses and slashes

parts = re.split(r'[()()//]', cleaned)
for part in parts:
p = part.strip().strip(',,、').strip()
if p and len(p) >= 2:
keys.add(p.lower())
part = part.strip().strip(',,、').strip()
if part and len(part) >= 2:
keys.update(name_keys(part))

base = re.sub(r'[((].*?[))]', '', cleaned).strip()
if base and len(base) >= 2:
keys.add(base.lower())
keys.update(name_keys(base))
return keys
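Taken together, these changes route every PDF-derived name through format_wos_name before merging, and delegate variant generation to the shared name_keys helper. A small usage sketch under the signature visible in this diff; the inputs are illustrative, and the assumption that format_wos_name normalizes WOS-style "Family, Given" names is inferred from its name and the prompt change above:

from citationclaw.core.affiliation_validator import AffiliationValidator

validator = AffiliationValidator()

# Author dicts use the keys visible in this diff.
api_authors = [{"name": "Jane Doe", "affiliation": "Example University",
                "country": "US", "affiliation_source": "s2"}]
pdf_authors = [{"name": "Doe, Jane",
                "affiliation": "Example University",
                "email": "jane.doe@example.edu"}]

# PDF affiliations are treated as publication-time truth; names are
# matched fuzzily, so the two spellings above should merge into one entry.
merged = validator.validate(api_authors, pdf_authors)
for author in merged:
    print(author["name"], author.get("affiliation_source"))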