1 change: 1 addition & 0 deletions .gitignore
@@ -29,6 +29,7 @@ data/result-*/
data/cache/
config.json
.superpowers/
autor-test/.env

# Local-only files (Claude instructions)
CLAUDE.md
3 changes: 3 additions & 0 deletions citationclaw/app/config_manager.py
@@ -140,6 +140,9 @@ class AppConfig(BaseModel):
# Semantic Scholar API Key (raises the rate limit: 1 req/s → 10-100 req/s)
s2_api_key: str = Field(default="", description="Semantic Scholar API Key (optional; greatly improves the PDF download success rate)")

# Web of Science Starter API Key (structured author extraction)
wos_api_key: str = Field(default="", description="Web of Science Starter API Key (used for structured author extraction; takes priority over S2)")

# MinerU Cloud API
mineru_api_token: str = Field(default="", description="MinerU Cloud Precision API Token (optional, used for parsing large files)")

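For orientation, the new field behaves like the existing ones on AppConfig: a plain string defaulting to empty, read defensively by the executor. A minimal sketch, assuming AppConfig can be constructed directly and that its remaining fields all have defaults, as the ones shown here do (key values are placeholders):

from citationclaw.app.config_manager import AppConfig

# Hypothetical keys; real values come from the user's config.json.
cfg = AppConfig(wos_api_key="WOS-XXXX", s2_api_key="S2-XXXX")

# task_executor.py reads the field via getattr, so configs saved
# before this change (without the attribute) still work:
wos_key = getattr(cfg, "wos_api_key", "") or ""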
2 changes: 1 addition & 1 deletion citationclaw/app/main.py
@@ -133,6 +133,7 @@ class ConfigUpdate(BaseModel):
dashboard_skip_citing_analysis: bool = False
dashboard_model: str = "gemini-3-flash-preview-nothinking"
s2_api_key: str = ""
wos_api_key: str = ""
mineru_api_token: str = ""
cdp_debug_port: int = 0
api_access_token: str = ""
@@ -165,7 +166,6 @@ async def get_providers():
async def save_config(config: ConfigUpdate):
try:
data = config.model_dump()
# Debug: log MinerU token save status
token = data.get("mineru_api_token", "")
if token:
print(f"[CONFIG] MinerU token 已保存: {token[:8]}...({len(token)} chars)")
88 changes: 86 additions & 2 deletions citationclaw/app/task_executor.py
@@ -88,6 +88,21 @@ async def _run_new_phase2_and_3(
collector = MetadataCollector(
s2_api_key=getattr(config, 's2_api_key', None),
)

# Structured author extraction: WOS→S2→MinerU (enabled when wos_api_key is set)
_wos_key = getattr(config, 'wos_api_key', '') or ''
_s2_key_for_fetcher = getattr(config, 's2_api_key', '') or ''
from citationclaw.core.structured_author_fetcher import StructuredAuthorFetcher
structured_fetcher: Optional[StructuredAuthorFetcher] = None
if _wos_key:
structured_fetcher = StructuredAuthorFetcher(
wos_api_key=_wos_key,
s2_api_key=_s2_key_for_fetcher,
log_callback=self.log_manager.info,
)
self.log_manager.info(f"📋 WOS 结构化作者提取已启用(WOS + S2 双源融合)")
else:
self.log_manager.info("⚪ 未配置 WOS Key,使用默认 S2+OpenAlex 流程")
self_cite_detector = SelfCitationDetector()
prefilter = ScholarPreFilter()

@@ -173,11 +188,50 @@ async def _fetch_one(idx: int, paper: dict, canonical: str):
metadata = cached
api_hits += 1
else:
# S2-first: search by title, then by URL if title miss
metadata = await collector.collect(title, paper_url=paper_link)
# Structured WOS→S2→OpenAlex extraction (preferred when wos_api_key is set)
wos_authors = []
wos_source = ""
if structured_fetcher:
try:
wos_authors, wos_source = await structured_fetcher.fetch(title)
except Exception:
wos_authors = []

if wos_authors:
# Use the WOS/S2 structured result directly and skip the collector
metadata = {
"title": title,
"doi": "", "s2_id": "", "arxiv_id": "",
"year": paper.get("paper_year"),
"cited_by_count": 0, "influential_citation_count": 0,
"pdf_url": "", "oa_pdf_url": "", "venue": "",
"authors": wos_authors,
"sources": [wos_source],
}
else:
# Fallback: S2-first via collector
metadata = await collector.collect(title, paper_url=paper_link)

if metadata:
await metadata_cache.update(metadata.get("doi", ""), title, metadata)
api_queries += 1

# ── DEBUG: per-paper author detail ──
if metadata:
_src = ",".join(metadata.get("sources", [])) or "?"
_authors = metadata.get("authors", [])
self.log_manager.info(
f" ┌─[{_src}] {title[:52]}"
)
for _a in _authors[:10]:
_n = _a.get("name", "?")
_af = (_a.get("affiliation", "") or "—")[:45]
_afsrc = f" [{_a['affiliation_source']}]" if _a.get("affiliation_source") else ""
self.log_manager.info(f" │ {_n} | {_af}{_afsrc}")
if len(_authors) > 10:
self.log_manager.info(f" │ …共 {len(_authors)} 位")
self.log_manager.info(" └─")

results_slots[idx] = metadata
except Exception as e:
# Don't let one paper's API failure crash the entire batch
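For context, the call sites above pin down the interface this diff assumes for StructuredAuthorFetcher: a constructor taking wos_api_key, s2_api_key, and log_callback, and an async fetch(title) returning an (authors, source) tuple, where a falsy author list triggers the collector fallback. A minimal stub consistent with those call sites; the real implementation lives in citationclaw/core/structured_author_fetcher.py, which this diff does not show:

from typing import Callable, List, Optional, Tuple

class StructuredAuthorFetcherStub:
    """Illustrative stand-in matching the call sites in this diff."""

    def __init__(
        self,
        wos_api_key: str,
        s2_api_key: str = "",
        log_callback: Optional[Callable[[str], None]] = None,
    ):
        self.wos_api_key = wos_api_key
        self.s2_api_key = s2_api_key
        self.log = log_callback or (lambda msg: None)

    async def fetch(self, title: str) -> Tuple[List[dict], str]:
        # The real fetcher queries WOS first, then S2; on a miss it
        # returns ([], "") so the caller falls back to collector.collect().
        self.log(f"structured fetch: {title[:40]}")
        return [], ""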
@@ -267,6 +321,36 @@ async def _fetch_one(idx: int, paper: dict, canonical: str):
f"Phase 2 完成: API找到 {api_found} / GS兜底 {gs_fallback_count} / "
f"缓存 {api_hits} / 共 {len(records_data)} 篇"
)

# ── DEBUG: save per-paper author breakdown to JSON ──
try:
import json as _dbg_json
_debug_records = []
for _i, (_paper, _meta, _canon) in enumerate(records_data):
_debug_records.append({
"idx": _i + 1,
"title": _paper.get("paper_title", ""),
"canonical": _canon,
"source": ",".join((_meta or {}).get("sources", [])) or "gs_fallback",
"authors": [
{
"name": _a.get("name", ""),
"affiliation": _a.get("affiliation", ""),
"affiliation_source": _a.get("affiliation_source", ""),
"s2_id": _a.get("s2_id", ""),
}
for _a in (_meta or {}).get("authors", [])
],
})
_debug_file = result_dir / f"{output_prefix}_author_debug.json"
_debug_file.write_text(
_dbg_json.dumps(_debug_records, ensure_ascii=False, indent=2),
encoding="utf-8",
)
self.log_manager.info(f"📄 作者调试文件: {_debug_file}")
except Exception as _e:
self.log_manager.warning(f"调试文件保存失败: {_e}")

if gs_fallback_count > len(records_data) * 0.5:
self.log_manager.warning(
f"⚠ {gs_fallback_count} 篇论文 API 未找到(S2/OpenAlex 均未收录),"
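The debug file written in the block above holds one record per paper, with exactly the keys assembled in the loop. A sample element (all values are made up for illustration):

# One element of <output_prefix>_author_debug.json:
record = {
    "idx": 1,
    "title": "An Example Paper Title",
    "canonical": "an example paper title",
    "source": "wos",            # or e.g. "s2,openalex", or "gs_fallback"
    "authors": [
        {
            "name": "Jane Doe",
            "affiliation": "Example University",
            "affiliation_source": "wos",
            "s2_id": "12345678",
        },
    ],
}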
7 changes: 4 additions & 3 deletions citationclaw/config/prompts/pdf_author_extract.txt
@@ -1,14 +1,15 @@
Below are text blocks from the first page of an academic paper (parsed from the PDF, in layout order):
Below are text blocks from an academic paper (parsed from the PDF, in layout order; they may span multiple pages):

{first_page_text}

Please extract all authors and their affiliation information from the text.

Requirements:
1. Only extract the authors and affiliations explicitly listed on the paper's first page
2. Take care to distinguish author names from other text (such as the title, abstract, and keywords)
1. If the first page shows only a team name (e.g., "ABC Team", "Research Group XYZ") rather than individual names, look for the full list of individual authors in the subsequent text blocks
2. Only extract authors and affiliations explicitly listed in the paper; do not fabricate any, and do not mix in authors from the references
3. If a corresponding email address is given, extract it as well
4. Keep institution names verbatim as written in the paper (do not translate them)
5. Write names in full "given family" format, not the comma-separated format

Output as a JSON array:
[
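The output block is truncated in this diff, but requirements 1-5 above constrain its shape: one object per author, with the full name, the verbatim institution string, and an email when present. An illustrative element with hypothetical values and field names (the authoritative names are in the untruncated prompt file):

[
    {
        "name": "Jane Doe",
        "affiliation": "Department of CS, Example University",
        "email": "jane.doe@example.edu"
    }
]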
29 changes: 12 additions & 17 deletions citationclaw/core/affiliation_validator.py
@@ -1,14 +1,8 @@
"""Cross-validate author affiliations between API data and PDF-extracted data.

Strategy:
- Match authors by name (fuzzy, handles Chinese/English variants)
- PDF affiliation = publication-time truth (preferred)
- API affiliation = current affiliation (may have changed)
- Merge: PDF > API Author-level > API paper-level > empty
"""
import re
from typing import List, Optional

from citationclaw.core.author_name_utils import format_wos_name, name_keys


class AffiliationValidator:
"""Cross-validate and merge author data from API and PDF sources."""
@@ -26,7 +20,7 @@ def validate(self, api_authors: List[dict], pdf_authors: List[dict]) -> List[dict]:
if not pdf_authors:
return api_authors
if not api_authors:
return [{"name": a["name"], "affiliation": a.get("affiliation", ""),
return [{"name": format_wos_name(a["name"]) or a["name"], "affiliation": a.get("affiliation", ""),
"country": "", "affiliation_source": "pdf"}
for a in pdf_authors]

@@ -79,7 +73,7 @@ def validate(self, api_authors: List[dict], pdf_authors: List[dict]) -> List[dict]:
if not (pdf_keys & matched_pdf_names):
pdf_affil = pdf_a.get("affiliation", "")
merged.append({
"name": pdf_a["name"],
"name": format_wos_name(pdf_a["name"]) or pdf_a["name"],
"affiliation": pdf_affil,
"email": pdf_a.get("email", ""),
"country": self._infer_country(pdf_affil),
@@ -115,7 +109,8 @@ def _infer_country(affiliation: str) -> str:
if any(k in aff for k in cn_kw):
return "CN"
# US institutions
us_kw = ["mit ", "m.i.t", "stanford", "harvard", "berkeley",
us_kw = ["mit", "m.i.t", "massachusetts institute of technology",
"stanford", "harvard", "berkeley",
"carnegie mellon", "cmu", "princeton", "yale",
"columbia university", "cornell", "ucla", "caltech",
"university of california", "university of michigan",
@@ -163,18 +158,18 @@ def _infer_country(affiliation: str) -> str:

@staticmethod
def _name_keys(name: str) -> set:
"""Extract all name variants for matching (same logic as scholar dedup)."""
keys = set()
cleaned = name.strip()
if not cleaned:
return keys
# Split on parentheses and slashes

parts = re.split(r'[()()//]', cleaned)
for part in parts:
p = part.strip().strip(',,、').strip()
if p and len(p) >= 2:
keys.add(p.lower())
part = part.strip().strip(',,、').strip()
if part and len(part) >= 2:
keys.update(name_keys(part))

base = re.sub(r'[((].*?[))]', '', cleaned).strip()
if base and len(base) >= 2:
keys.add(base.lower())
keys.update(name_keys(base))
return keys
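Taken together, these changes route every PDF-derived name through format_wos_name before merging, and delegate variant generation to the shared name_keys helper. A small usage sketch under the signature visible in this diff; the inputs are illustrative, and the assumption that format_wos_name normalizes WOS-style "Family, Given" names is inferred from its name and the prompt change above:

from citationclaw.core.affiliation_validator import AffiliationValidator

validator = AffiliationValidator()

# Author dicts use the keys visible in this diff.
api_authors = [{"name": "Jane Doe", "affiliation": "Example University",
                "country": "US", "affiliation_source": "s2"}]
pdf_authors = [{"name": "Doe, Jane",
                "affiliation": "Example University",
                "email": "jane.doe@example.edu"}]

# PDF affiliations are treated as publication-time truth; names are
# matched fuzzily, so the two spellings above should merge into one entry.
merged = validator.validate(api_authors, pdf_authors)
for author in merged:
    print(author["name"], author.get("affiliation_source"))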