diff --git a/.gitignore b/.gitignore index 09c2e03..ae00707 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,7 @@ data/result-*/ data/cache/ config.json .superpowers/ +autor-test/.env # Local-only files (Claude instructions) CLAUDE.md diff --git a/citationclaw/app/config_manager.py b/citationclaw/app/config_manager.py index 689a1cf..c26ef30 100644 --- a/citationclaw/app/config_manager.py +++ b/citationclaw/app/config_manager.py @@ -140,6 +140,9 @@ class AppConfig(BaseModel): # Semantic Scholar API Key (提升速率限制: 1 req/s → 10-100 req/s) s2_api_key: str = Field(default="", description="Semantic Scholar API Key(可选,大幅提升 PDF 下载成功率)") + # Web of Science Starter API Key (结构化作者提取) + wos_api_key: str = Field(default="", description="Web of Science Starter API Key(用于结构化作者提取,优先级高于 S2)") + # MinerU Cloud API mineru_api_token: str = Field(default="", description="MinerU Cloud Precision API Token(可选,用于大文件解析)") diff --git a/citationclaw/app/main.py b/citationclaw/app/main.py index 78bca2e..2d498df 100644 --- a/citationclaw/app/main.py +++ b/citationclaw/app/main.py @@ -133,6 +133,7 @@ class ConfigUpdate(BaseModel): dashboard_skip_citing_analysis: bool = False dashboard_model: str = "gemini-3-flash-preview-nothinking" s2_api_key: str = "" + wos_api_key: str = "" mineru_api_token: str = "" cdp_debug_port: int = 0 api_access_token: str = "" @@ -165,7 +166,6 @@ async def get_providers(): async def save_config(config: ConfigUpdate): try: data = config.model_dump() - # Debug: log MinerU token save status token = data.get("mineru_api_token", "") if token: print(f"[CONFIG] MinerU token 已保存: {token[:8]}...({len(token)} chars)") diff --git a/citationclaw/app/task_executor.py b/citationclaw/app/task_executor.py index 81f7d8f..f527687 100644 --- a/citationclaw/app/task_executor.py +++ b/citationclaw/app/task_executor.py @@ -88,6 +88,21 @@ async def _run_new_phase2_and_3( collector = MetadataCollector( s2_api_key=getattr(config, 's2_api_key', None), ) + + # 结构化作者提取:WOS→S2→MinerU(有 wos_api_key 时启用) + _wos_key = getattr(config, 'wos_api_key', '') or '' + _s2_key_for_fetcher = getattr(config, 's2_api_key', '') or '' + from citationclaw.core.structured_author_fetcher import StructuredAuthorFetcher + structured_fetcher: Optional[StructuredAuthorFetcher] = None + if _wos_key: + structured_fetcher = StructuredAuthorFetcher( + wos_api_key=_wos_key, + s2_api_key=_s2_key_for_fetcher, + log_callback=self.log_manager.info, + ) + self.log_manager.info(f"📋 WOS 结构化作者提取已启用(WOS + S2 双源融合)") + else: + self.log_manager.info("⚪ 未配置 WOS Key,使用默认 S2+OpenAlex 流程") self_cite_detector = SelfCitationDetector() prefilter = ScholarPreFilter() @@ -173,11 +188,50 @@ async def _fetch_one(idx: int, paper: dict, canonical: str): metadata = cached api_hits += 1 else: - # S2-first: search by title, then by URL if title miss - metadata = await collector.collect(title, paper_url=paper_link) + # WOS→S2→OpenAlex 结构化提取(有 wos_api_key 时优先) + wos_authors = [] + wos_source = "" + if structured_fetcher: + try: + wos_authors, wos_source = await structured_fetcher.fetch(title) + except Exception: + wos_authors = [] + + if wos_authors: + # WOS/S2 结构化结果直接使用,跳过 collector + metadata = { + "title": title, + "doi": "", "s2_id": "", "arxiv_id": "", + "year": paper.get("paper_year"), + "cited_by_count": 0, "influential_citation_count": 0, + "pdf_url": "", "oa_pdf_url": "", "venue": "", + "authors": wos_authors, + "sources": [wos_source], + } + else: + # Fallback: S2-first via collector + metadata = await collector.collect(title, paper_url=paper_link) + if metadata: 
await metadata_cache.update(metadata.get("doi", ""), title, metadata) api_queries += 1 + + # ── DEBUG: per-paper author detail ── + if metadata: + _src = ",".join(metadata.get("sources", [])) or "?" + _authors = metadata.get("authors", []) + self.log_manager.info( + f" ┌─[{_src}] {title[:52]}" + ) + for _a in _authors[:10]: + _n = _a.get("name", "?") + _af = (_a.get("affiliation", "") or "—")[:45] + _afsrc = f" [{_a['affiliation_source']}]" if _a.get("affiliation_source") else "" + self.log_manager.info(f" │ {_n} | {_af}{_afsrc}") + if len(_authors) > 10: + self.log_manager.info(f" │ …共 {len(_authors)} 位") + self.log_manager.info(" └─") + results_slots[idx] = metadata except Exception as e: # Don't let one paper's API failure crash the entire batch @@ -267,6 +321,36 @@ async def _fetch_one(idx: int, paper: dict, canonical: str): f"Phase 2 完成: API找到 {api_found} / GS兜底 {gs_fallback_count} / " f"缓存 {api_hits} / 共 {len(records_data)} 篇" ) + + # ── DEBUG: save per-paper author breakdown to JSON ── + try: + import json as _dbg_json + _debug_records = [] + for _i, (_paper, _meta, _canon) in enumerate(records_data): + _debug_records.append({ + "idx": _i + 1, + "title": _paper.get("paper_title", ""), + "canonical": _canon, + "source": ",".join((_meta or {}).get("sources", [])) or "gs_fallback", + "authors": [ + { + "name": _a.get("name", ""), + "affiliation": _a.get("affiliation", ""), + "affiliation_source": _a.get("affiliation_source", ""), + "s2_id": _a.get("s2_id", ""), + } + for _a in (_meta or {}).get("authors", []) + ], + }) + _debug_file = result_dir / f"{output_prefix}_author_debug.json" + _debug_file.write_text( + _dbg_json.dumps(_debug_records, ensure_ascii=False, indent=2), + encoding="utf-8", + ) + self.log_manager.info(f"📄 作者调试文件: {_debug_file}") + except Exception as _e: + self.log_manager.warning(f"调试文件保存失败: {_e}") + if gs_fallback_count > len(records_data) * 0.5: self.log_manager.warning( f"⚠ {gs_fallback_count} 篇论文 API 未找到(S2/OpenAlex 均未收录)," diff --git a/citationclaw/config/prompts/pdf_author_extract.txt b/citationclaw/config/prompts/pdf_author_extract.txt index 21e45ef..14017c7 100644 --- a/citationclaw/config/prompts/pdf_author_extract.txt +++ b/citationclaw/config/prompts/pdf_author_extract.txt @@ -1,14 +1,15 @@ -以下是一篇学术论文首页的文本块(来自 PDF 解析,按排版顺序排列): +以下是一篇学术论文的文本块(来自 PDF 解析,按排版顺序排列,可能包含多页内容): {first_page_text} 请从中提取所有作者及其单位信息。 要求: -1. 只提取论文首页中明确列出的作者和单位 -2. 注意区分作者名和其他文本(如标题、摘要、关键词) +1. 若首页仅显示 team 名称(如 "ABC Team", "Research Group XYZ")而非具体人名,请在后续文本块中查找完整的作者个人名单 +2. 只提取论文中明确列出的作者和单位,不允许捏造,不允许混入参考文献中的作者 3. 如果有对应的邮箱,也请提取 4. 机构名请用论文中写的原文(不要翻译) +5. 姓名请使用完整格式(名 姓),不要使用逗号分隔格式 以 JSON 数组格式输出: [ diff --git a/citationclaw/core/affiliation_validator.py b/citationclaw/core/affiliation_validator.py index 71bf284..7c4d932 100644 --- a/citationclaw/core/affiliation_validator.py +++ b/citationclaw/core/affiliation_validator.py @@ -1,14 +1,8 @@ -"""Cross-validate author affiliations between API data and PDF-extracted data. 
- -Strategy: -- Match authors by name (fuzzy, handles Chinese/English variants) -- PDF affiliation = publication-time truth (preferred) -- API affiliation = current affiliation (may have changed) -- Merge: PDF > API Author-level > API paper-level > empty -""" import re from typing import List, Optional +from citationclaw.core.author_name_utils import format_wos_name, name_keys + class AffiliationValidator: """Cross-validate and merge author data from API and PDF sources.""" @@ -26,7 +20,7 @@ def validate(self, api_authors: List[dict], pdf_authors: List[dict]) -> List[dic if not pdf_authors: return api_authors if not api_authors: - return [{"name": a["name"], "affiliation": a.get("affiliation", ""), + return [{"name": format_wos_name(a["name"]) or a["name"], "affiliation": a.get("affiliation", ""), "country": "", "affiliation_source": "pdf"} for a in pdf_authors] @@ -79,7 +73,7 @@ def validate(self, api_authors: List[dict], pdf_authors: List[dict]) -> List[dic if not (pdf_keys & matched_pdf_names): pdf_affil = pdf_a.get("affiliation", "") merged.append({ - "name": pdf_a["name"], + "name": format_wos_name(pdf_a["name"]) or pdf_a["name"], "affiliation": pdf_affil, "email": pdf_a.get("email", ""), "country": self._infer_country(pdf_affil), @@ -115,7 +109,8 @@ def _infer_country(affiliation: str) -> str: if any(k in aff for k in cn_kw): return "CN" # US institutions - us_kw = ["mit ", "m.i.t", "stanford", "harvard", "berkeley", + us_kw = ["mit", "m.i.t", "massachusetts institute of technology", + "stanford", "harvard", "berkeley", "carnegie mellon", "cmu", "princeton", "yale", "columbia university", "cornell", "ucla", "caltech", "university of california", "university of michigan", @@ -163,18 +158,18 @@ def _infer_country(affiliation: str) -> str: @staticmethod def _name_keys(name: str) -> set: - """Extract all name variants for matching (same logic as scholar dedup).""" keys = set() cleaned = name.strip() if not cleaned: return keys - # Split on parentheses and slashes + parts = re.split(r'[()()//]', cleaned) for part in parts: - p = part.strip().strip(',,、').strip() - if p and len(p) >= 2: - keys.add(p.lower()) + part = part.strip().strip(',,、').strip() + if part and len(part) >= 2: + keys.update(name_keys(part)) + base = re.sub(r'[((].*?[))]', '', cleaned).strip() if base and len(base) >= 2: - keys.add(base.lower()) + keys.update(name_keys(base)) return keys diff --git a/citationclaw/core/author_name_utils.py b/citationclaw/core/author_name_utils.py new file mode 100644 index 0000000..9303794 --- /dev/null +++ b/citationclaw/core/author_name_utils.py @@ -0,0 +1,261 @@ +import re +import unicodedata + + +SPECIAL_CHAR_MAP = str.maketrans( + { + "Ł": "L", + "ł": "l", + "Đ": "D", + "đ": "d", + "Ø": "O", + "ø": "o", + } +) + + +def strip_accents(text: str) -> str: + normalized = unicodedata.normalize("NFKD", (text or "").translate(SPECIAL_CHAR_MAP)) + return "".join(ch for ch in normalized if not unicodedata.combining(ch)) + + +def _clean_token(text: str) -> str: + text = strip_accents(text or "") + text = text.lower() + text = re.sub(r"[^a-z0-9]+", " ", text) + return " ".join(text.split()) + + +def split_name_parts(name: str) -> tuple[str, list[str]]: + cleaned = strip_accents(name or "") + cleaned = re.sub(r"\s+", " ", cleaned).strip().strip(",") + if not cleaned: + return "", [] + + if "," in cleaned: + family, given = [part.strip() for part in cleaned.split(",", 1)] + given_parts = [part for part in re.split(r"\s+", given) if part] + return family, given_parts + + parts = [part for part in 
re.split(r"\s+", cleaned) if part] + if len(parts) == 1: + return parts[0], [] + return parts[-1], parts[:-1] + + +def format_wos_name(name: str) -> str: + family, given_parts = split_name_parts(name) + if not family: + return "" + family = re.sub(r"\s+", " ", family).strip() + given = " ".join(given_parts).strip() + return f"{family}, {given}".strip().strip(",") + + +def display_to_full_name(name: str) -> str: + family, given_parts = split_name_parts(name) + if not family: + return "" + if not given_parts: + return family + return " ".join([*given_parts, family]).strip() + + +def name_keys(name: str) -> set[str]: + full_name = display_to_full_name(name) + normalized_full = _clean_token(full_name) + if not normalized_full: + return set() + + parts = normalized_full.split() + family = parts[-1] + given = parts[:-1] + + keys = { + normalized_full, + family, + _clean_token(format_wos_name(name)), + } + + if given: + keys.add(f"{given[0]} {family}") + keys.add(f"{family} {given[0]}") + keys.add(f"{given[0][0]} {family}") + keys.add(f"{family} {given[0][0]}") + keys.add(f"{family} {' '.join(given)}") + if len(given) > 1: + initials = " ".join(part[0] for part in given if part) + if initials: + keys.add(f"{initials} {family}") + keys.add(f"{family} {initials}") + + return {key for key in keys if key} + + +# --------------------------------------------------------------------------- +# Enhanced matching (handles WOS abbreviated initials, inverted format, etc.) +# --------------------------------------------------------------------------- + +def _normalize_allcaps(name: str) -> str: + """'FU, DARWIN Y' → 'Fu, Darwin Y'. Only triggers when entire name is uppercase.""" + stripped = re.sub(r"[,.\s]", "", name) + if stripped.isalpha() and stripped.isupper() and len(stripped) > 2: + return name.title() + return name + + +def _is_initials_token(s: str) -> bool: + """Return True if s looks like initials: 1–4 uppercase-only alpha chars (e.g. 'K', 'KM', 'CA').""" + s = re.sub(r"[\s.]+", "", s) + return bool(s) and s.isalpha() and s.upper() == s and 1 <= len(s) <= 4 + + +def _expand_initials(token: str) -> list[str]: + """Split a given-name token into a list of single uppercase initials. + + 'KM' → ['K', 'M'] (WOS concatenated syllable initials for Chinese names) + 'K M' → ['K', 'M'] + 'K.' → ['K'] + 'Kaiming' → ['K'] (full name → just first char) + 'Christopher' → ['C'] + """ + token = token.strip().rstrip(".") + parts = [p.rstrip(".") for p in re.split(r"[\s.]+", token) if p.rstrip(".")] + if not parts: + return [] + if len(parts) > 1: + return [p[0].upper() for p in parts if p] + token = parts[0] + # Concatenated uppercase initials like "KM", "XY", "CA" + if token.isupper() and token.isalpha() and 2 <= len(token) <= 4: + return list(token) + return [token[0].upper()] + + +def _parse_for_match(name: str) -> tuple[str, list[str]]: + """Parse any name format → (family_normalized, [given_initials]).""" + name = _normalize_allcaps(name.strip()) + family, given_parts = split_name_parts(name) + + if not family: + return "", [] + + given_str = " ".join(given_parts) + if _is_initials_token(family) and given_str and len(given_str) > 3: + family, given_parts = given_str, [family] + + family_norm = _clean_token(strip_accents(family)) + + initials: list[str] = [] + for part in given_parts: + initials.extend(_expand_initials(part)) + + return family_norm, initials + + +def _parse_first_given(name: str) -> tuple[str, str]: + """Parse any name format → (family_norm, first_given_norm). 
+
+    Preserves the full first given-name token (not reduced to an initial) so that
+    names_match can distinguish 'Christopher' from 'Carol' even though both start with C.
+
+    'He, Kaiming'             → ('he', 'kaiming')
+    'He, KM'                  → ('he', 'km')         ← len<=2 → treated as initials downstream
+    'Eger, T'                 → ('eger', 't')        ← len=1 → initials
+    'H, Melchinger'           → ('melchinger', 'h')  ← inverted
+    'FU, DARWIN Y'            → ('fu', 'darwin')
+    'Manning, Christopher D.' → ('manning', 'christopher')
+    """
+    name = _normalize_allcaps(name.strip())
+    family, given_parts = split_name_parts(name)
+
+    if not family:
+        return "", ""
+
+    given_str = " ".join(given_parts)
+    if _is_initials_token(family) and given_str and len(given_str) > 3:
+        family, given_parts = given_str, [family]
+
+    family_norm = _clean_token(strip_accents(family))
+    first_given = _clean_token(strip_accents(given_parts[0])) if given_parts else ""
+    return family_norm, first_given
+
+
+def names_match(name_a: str, name_b: str) -> bool:
+    """Return True if two name strings likely refer to the same person.
+
+    Matching rules, applied in order:
+
+    1. Exact match after normalization.
+    2. Family name must be identical.
+    3. If either side has no given-name info → family match is sufficient.
+    4. If either first given token is 'initials' (len ≤ 2, e.g. 'k', 'km', 't'):
+       → first character must match.
+       'He, KM' matches 'He, Kaiming'   (k == k)
+       'Eger, T' matches 'Eger, Thomas' (t == t)
+    5. Both sides have full given names (len ≥ 3):
+       → first 3 characters must match.
+       'Manning, Christopher' does NOT match 'Manning, Carol' (chr ≠ car)
+       'He, Kaiming' matches 'He, Kai'  (kai == kai, prefix of length 3)
+       'Li, Wei' does NOT match 'Li, Wenbo' (wei ≠ wen)
+
+    Rule 5 prevents two distinct people with the same surname and same first
+    initial from being incorrectly merged.
+    """
+    if not name_a or not name_b:
+        return False
+
+    # Rule 1
+    if _clean_token(strip_accents(name_a)) == _clean_token(strip_accents(name_b)):
+        return True
+
+    fam_a, given_a = _parse_first_given(name_a)
+    fam_b, given_b = _parse_first_given(name_b)
+
+    # Rule 2
+    if not fam_a or not fam_b or fam_a != fam_b:
+        return False
+
+    # Rule 3
+    if not given_a or not given_b:
+        return True
+
+    # Rules 4 & 5: initials = len ≤ 2 after clean_token
+    is_init_a = len(given_a) <= 2
+    is_init_b = len(given_b) <= 2
+
+    if is_init_a or is_init_b:
+        # Rule 4: at least one abbreviated → first character must match
+        return given_a[0] == given_b[0]
+
+    # Rule 5: both full names → 3-char prefix must match
+    prefix = min(len(given_a), len(given_b), 3)
+    return given_a[:prefix] == given_b[:prefix]
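Worked examples of the rule set above (an editorial sketch, not part of the patch): assuming `author_name_utils` is importable as committed here, each assertion follows directly from the numbered rules in the docstring.

```python
from citationclaw.core.author_name_utils import names_match

# Rule 4: an abbreviated given name matches on its first initial
assert names_match("He, KM", "Kaiming He")            # k == k
assert names_match("Eger, T", "Eger, Thomas")         # t == t
# Rule 5: two full given names must share a 3-char prefix
assert names_match("He, Kaiming", "He, Kai")          # kai == kai
assert not names_match("Manning, Christopher D.", "Manning, Carol")  # chr != car
assert not names_match("Li, Wei", "Li, Wenbo")        # wei != wen
# All-caps WOS records are normalized before comparison
assert names_match("FU, DARWIN Y", "Fu, Darwin")
# Rule 2: family names must be identical
assert not names_match("He, Kaiming", "Hu, Kaiming")
```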
+
+
+def to_natural_name(name: str) -> str:
+    """Convert any name format to natural 'First Last' (no comma).
+
+    'He, Kaiming'             → 'Kaiming He'
+    'He, KM'                  → 'KM He'
+    'Manning, Christopher D.' → 'Christopher D. Manning'
+    'H, Melchinger'           → 'H Melchinger'  (inverted detected: H is initial, Melchinger is surname)
+    'FU, DARWIN Y'            → 'Darwin Y Fu'   (all-caps normalized)
+    'Kaiming He'              → 'Kaiming He'    (already natural, no change)
+    """
+    name = _normalize_allcaps(name.strip())
+    family, given_parts = split_name_parts(name)
+
+    if not family:
+        return name
+
+    # Inverted detection
+    given_str = " ".join(given_parts)
+    if _is_initials_token(family) and given_str and len(given_str) > 3:
+        family, given_parts = given_str, [family]
+
+    if not given_parts:
+        return family
+
+    given = " ".join(given_parts)
+    return f"{given} {family}"
diff --git a/citationclaw/core/author_searcher.py b/citationclaw/core/author_searcher.py
index a19d270..33df9ea 100644
--- a/citationclaw/core/author_searcher.py
+++ b/citationclaw/core/author_searcher.py
@@ -5,6 +5,7 @@
 from openai import AsyncOpenAI
 import httpx
 from citationclaw.core.author_cache import AuthorInfoCache
+from citationclaw.core.structured_author_fetcher import StructuredAuthorFetcher
 
 
 class AuthorSearcher:
@@ -27,6 +28,9 @@ def __init__(
         target_paper_authors: Optional[str] = None,
         author_cache: Optional[AuthorInfoCache] = None,
         cancel_event: Optional[asyncio.Event] = None,
+        wos_api_key: str = "",
+        s2_api_key: str = "",
+        mineru_api_token: str = "",
     ):
         """
         作者学术信息搜索器
@@ -91,6 +95,27 @@ def __init__(
         self.author_cache: Optional[AuthorInfoCache] = author_cache
         self.cancel_event: Optional[asyncio.Event] = cancel_event
 
+        # 结构化作者提取(WOS→S2→MinerU),有 wos_api_key 或 s2_api_key 时启用
+        self.structured_fetcher: Optional[StructuredAuthorFetcher] = None
+        if wos_api_key or s2_api_key:
+            self.structured_fetcher = StructuredAuthorFetcher(
+                wos_api_key=wos_api_key,
+                s2_api_key=s2_api_key,
+                mineru_api_token=mineru_api_token,
+                openai_api_key=api_key,
+                openai_base_url=base_url,
+                model=model,
+                log_callback=log_callback,
+            )
+            sources = []
+            if wos_api_key:
+                sources.append("WOS")
+            if s2_api_key:
+                sources.append("S2")
+            log_callback(f"📋 结构化作者提取已启用:{'→'.join(sources)}→MinerU")
+        else:
+            log_callback("⚪ 结构化作者提取未启用(未配置 WOS/S2 API Key)")
+
         # 自引检测 Prompt(使用轻量级模型)
         self.self_citation_check_prompt = (
             "【任务】判断一篇施引论文是否为自引。\n"
@@ -378,6 +403,30 @@ async def _search_single_paper(
             'Authors_with_Profile': str(paper_content['authors']),
         }
 
+        # ── 结构化作者提取(WOS→S2→MinerU)─────────────────────────────
+        if self.structured_fetcher:
+            doi = paper_content.get('doi', '')
+            pdf_path = paper_content.get('pdf_path', None)
+            self.log_callback(f"  🔍 [结构化] 查询作者: {paper_title[:50]}...")
+            try:
+                struct_authors, struct_source = await self.structured_fetcher.fetch(
+                    paper_title, doi=doi, pdf_path=pdf_path
+                )
+                record_dict['Paper_Authors'] = struct_authors
+                record_dict['Paper_Authors_Source'] = struct_source
+                if struct_authors:
+                    self.log_callback(
+                        f"  📋 [{struct_source}] 找到 {len(struct_authors)} 位作者: {paper_title[:40]}..."
+                    )
+                else:
+                    self.log_callback(
+                        f"  ⚪ [结构化] 未找到作者(WOS/S2 均无收录): {paper_title[:40]}..."
+ ) + except Exception as exc: + self.log_callback(f" ⚠️ 结构化作者提取失败: {exc}") + record_dict['Paper_Authors'] = [] + record_dict['Paper_Authors_Source'] = '' + # ── 查询缓存(取出已有字段供后续各步使用)──────────────────────── cached = (await self.author_cache.get(paper_link, paper_title)) if self.author_cache else None diff --git a/citationclaw/core/http_utils.py b/citationclaw/core/http_utils.py index 88a542a..14a51f7 100644 --- a/citationclaw/core/http_utils.py +++ b/citationclaw/core/http_utils.py @@ -18,14 +18,26 @@ def _detect_http_proxy() -> Optional[str]: return None +def _detect_ca_bundle() -> Optional[str]: + """Return custom CA bundle path if set via any of the common env vars.""" + for var in ["CA_BUNDLE_PATH", "SSL_CERT_FILE", "REQUESTS_CA_BUNDLE", "CURL_CA_BUNDLE"]: + val = os.environ.get(var, "").strip().strip('"').strip("'") + if val and os.path.exists(val): + return val + return None + + def make_async_client(timeout: float = 30.0) -> httpx.AsyncClient: - """Create an httpx AsyncClient with auto-detected proxy. + """Create an httpx AsyncClient with auto-detected proxy and CA bundle. Uses HTTP proxy if available, otherwise direct connection. + Respects CA_BUNDLE_PATH / SSL_CERT_FILE for corporate TLS interception. """ proxy = _detect_http_proxy() + ca_bundle = _detect_ca_bundle() return httpx.AsyncClient( proxy=proxy, timeout=timeout, + verify=ca_bundle if ca_bundle else True, headers={"User-Agent": "CitationClaw/2.0 (academic research tool)"}, ) diff --git a/citationclaw/core/pdf_author_extractor.py b/citationclaw/core/pdf_author_extractor.py index 7a7fe42..b8c0f7c 100644 --- a/citationclaw/core/pdf_author_extractor.py +++ b/citationclaw/core/pdf_author_extractor.py @@ -1,4 +1,4 @@ -"""Extract authors and affiliations from PDF first page using lightweight LLM.""" +"""Extract authors and affiliations from PDF content using lightweight LLM.""" import json from typing import List, Optional @@ -7,7 +7,7 @@ class PDFAuthorExtractor: - """Extract authors + affiliations from PDF first page via lightweight LLM.""" + """Extract authors + affiliations from PDF full content via lightweight LLM.""" def __init__(self, api_key: str = "", base_url: str = "", model: str = ""): self._api_key = api_key @@ -15,17 +15,17 @@ def __init__(self, api_key: str = "", base_url: str = "", model: str = ""): self._model = model self._prompt_loader = PromptLoader() - async def extract(self, first_page_blocks: list) -> List[dict]: - """Send first-page text blocks to lightweight LLM, return author list. + async def extract(self, blocks: list) -> List[dict]: + """Send PDF text blocks to lightweight LLM, return author list. 
Returns: [{"name": "...", "affiliation": "...", "email": "..."}] """ - if not self._api_key or not first_page_blocks: + if not self._api_key or not blocks: return [] # Build text from blocks lines = [] - for i, b in enumerate(first_page_blocks[:20]): + for i, b in enumerate(blocks): text = b.get("text", "").strip() if isinstance(b, dict) else str(b).strip() if text: lines.append(f"[{i}] {text}") diff --git a/citationclaw/core/pdf_mineru_parser.py b/citationclaw/core/pdf_mineru_parser.py index 21bc248..d69914f 100644 --- a/citationclaw/core/pdf_mineru_parser.py +++ b/citationclaw/core/pdf_mineru_parser.py @@ -115,6 +115,20 @@ def parse(self, pdf_path: Path, paper_key: str) -> Optional[dict]: return result return self._parse_pymupdf(pdf_path, output_dir) + @staticmethod + def _write_meta(output_dir: Path, source: str, parsed_at: str) -> None: + (output_dir / "meta.json").write_text( + json.dumps( + { + "source": source, + "parsed_at": parsed_at, + }, + ensure_ascii=False, + indent=2, + ), + encoding="utf-8", + ) + @staticmethod def _get_page_count(pdf_path: Path) -> int: """Quick page count via PyMuPDF (no full parse).""" @@ -150,14 +164,26 @@ async def parse_async(self, pdf_path: Path, paper_key: str) -> Optional[dict]: - All cloud fail: Local MinerU (serial) → PyMuPDF """ output_dir = self._output_base / paper_key + if self._log: + self._log( + f" [MinerU] 开始解析 key={paper_key} file={pdf_path.name}" + ) # 1. Cache cached = self._load_cached(output_dir) if cached: + if self._log: + self._log( + f" [MinerU] 命中缓存 source={cached.get('source','?')} dir={output_dir}" + ) return cached file_size = pdf_path.stat().st_size if pdf_path.exists() else 0 page_count = self._get_page_count(pdf_path) + if self._log: + self._log( + f" [MinerU] 文件信息 size={file_size // 1024}KB pages={page_count}" + ) # Skip oversized PDFs (>100MB or >200 pages) if file_size > 100 * 1024 * 1024 or page_count > 200: @@ -169,22 +195,30 @@ async def parse_async(self, pdf_path: Path, paper_key: str) -> Optional[dict]: # 2. Cloud routing — try both tiers with smart fallthrough if is_large and self._mineru_token and not self._cloud_precision_failed: + if self._log: + self._log(" [MinerU] 路由: large PDF -> Cloud Precision first") # Large + has token → Precision first result = await self._parse_cloud_precision(pdf_path, output_dir) if result: return result # Precision failed → try Agent as fallback (may reject on size) if not self._cloud_agent_disabled: + if self._log: + self._log(" [MinerU] Cloud Precision 未成功,尝试 Cloud Agent") result = await self._parse_cloud_agent(pdf_path, output_dir) if result: return result elif is_large and not self._mineru_token: + if self._log: + self._log(" [MinerU] 路由: large PDF without token -> try Cloud Agent") # Large + no token → Agent will likely reject, try anyway then local if not self._cloud_agent_disabled: result = await self._parse_cloud_agent(pdf_path, output_dir) if result: return result else: + if self._log: + self._log(" [MinerU] 路由: small PDF -> Cloud Agent first") # Small file → Agent first if not self._cloud_agent_disabled: result = await self._parse_cloud_agent(pdf_path, output_dir) @@ -192,12 +226,16 @@ async def parse_async(self, pdf_path: Path, paper_key: str) -> Optional[dict]: return result # Agent failed → try Precision if token available if self._mineru_token and not self._cloud_precision_failed: + if self._log: + self._log(" [MinerU] Cloud Agent 未成功,尝试 Cloud Precision") result = await self._parse_cloud_precision(pdf_path, output_dir) if result: return result # 3. 
Local MinerU (serialized — only 1 at a time) if self._has_local_mineru and not self._local_mineru_failed: + if self._log: + self._log(" [MinerU] 尝试本地 MinerU") async with _local_mineru_lock: result = await asyncio.to_thread( self._parse_local_mineru, pdf_path, output_dir @@ -206,18 +244,34 @@ async def parse_async(self, pdf_path: Path, paper_key: str) -> Optional[dict]: return result # 4. PyMuPDF fallback + if self._log: + self._log(" [MinerU] 所有 MinerU 路径未成功,回退到 PyMuPDF") return self._parse_pymupdf(pdf_path, output_dir) # ── Cloud Agent API (free, ≤10MB/≤20 pages) ────────────────────────── @staticmethod def _make_direct_client(timeout: float = 120.0): - """Create httpx client WITHOUT proxy — MinerU is a China service, no proxy needed.""" + """Create direct httpx client while still honoring explicit CA bundle env vars. + + We intentionally keep ``trust_env=False`` to avoid inheriting system proxies + such as socks5/ALL_PROXY, but Cloudflare Gateway / enterprise TLS setups often + require a custom CA bundle passed via ``SSL_CERT_FILE`` or + ``REQUESTS_CA_BUNDLE``. httpx ignores those env vars when ``trust_env=False``, + so we wire them into ``verify`` manually. + """ import httpx + ca_bundle = None + for var in ["CA_BUNDLE_PATH", "SSL_CERT_FILE", "REQUESTS_CA_BUNDLE", "CURL_CA_BUNDLE"]: + val = (os.environ.get(var) or "").strip().strip('"').strip("'") + if val and os.path.exists(val): + ca_bundle = val + break return httpx.AsyncClient( proxy=None, trust_env=False, # Ignore ALL_PROXY / socks5 env vars timeout=timeout, + verify=ca_bundle if ca_bundle else True, headers={"User-Agent": "CitationClaw/2.0"}, ) @@ -289,14 +343,15 @@ async def _parse_cloud_agent(self, pdf_path: Path, output_dir: Path) -> Optional # Save to cache output_dir.mkdir(parents=True, exist_ok=True) (output_dir / "full.md").write_text(md_text, encoding="utf-8") - + parsed_at = datetime.now(timezone.utc).isoformat() + self._write_meta(output_dir, "mineru_cloud_agent", parsed_at) return { "content_list": [], "full_md": md_text, "first_page_blocks": self._md_to_first_page(md_text), "references_md": self._extract_references(md_text), "source": "mineru_cloud_agent", - "parsed_at": datetime.now(timezone.utc).isoformat(), + "parsed_at": parsed_at, } except Exception as e: err = str(e)[:80] @@ -463,6 +518,8 @@ def _extract_from_zip(self, zip_bytes: bytes, output_dir: Path) -> Optional[dict if not md_text: return None + parsed_at = datetime.now(timezone.utc).isoformat() + self._write_meta(output_dir, "mineru_cloud_precision", parsed_at) return { "content_list": content_list, "full_md": md_text, @@ -472,7 +529,7 @@ def _extract_from_zip(self, zip_bytes: bytes, output_dir: Path) -> Optional[dict ), "references_md": self._extract_references(md_text), "source": "mineru_cloud_precision", - "parsed_at": datetime.now(timezone.utc).isoformat(), + "parsed_at": parsed_at, } # ── Local MinerU (needs GPU + models) ───────────────────────────────── @@ -520,6 +577,8 @@ def _parse_local_mineru(self, pdf_path: Path, output_dir: Path) -> Optional[dict # First successful local parse → symlink models to project for next time self._ensure_project_model_link() + parsed_at = datetime.now(timezone.utc).isoformat() + self._write_meta(output_dir, "mineru_local", parsed_at) return { "content_list": content_list, "full_md": md_text, @@ -528,7 +587,7 @@ def _parse_local_mineru(self, pdf_path: Path, output_dir: Path) -> Optional[dict ), "references_md": self._extract_references(md_text), "source": "mineru_local", - "parsed_at": 
datetime.now(timezone.utc).isoformat(), + "parsed_at": parsed_at, } except Exception as e: error_msg = str(e) @@ -594,13 +653,15 @@ def _parse_pymupdf(self, pdf_path: Path, output_dir: Path) -> Optional[dict]: output_dir.mkdir(parents=True, exist_ok=True) (output_dir / "full.md").write_text(full_text, encoding="utf-8") + parsed_at = datetime.now(timezone.utc).isoformat() + self._write_meta(output_dir, "pymupdf", parsed_at) return { "content_list": [], "full_md": full_text, "first_page_blocks": self._md_to_first_page(first_page), "references_md": self._extract_references(full_text), "source": "pymupdf", - "parsed_at": datetime.now(timezone.utc).isoformat(), + "parsed_at": parsed_at, } except Exception: return None @@ -619,10 +680,17 @@ def _load_cached(self, output_dir: Path) -> Optional[dict]: try: md_text = md_path.read_text(encoding="utf-8") content_list = [] + meta_path = output_dir / "meta.json" + source = "" + parsed_at = datetime.now(timezone.utc).isoformat() for f in output_dir.rglob("*content_list.json"): with open(f) as fh: content_list = json.load(fh) break + if meta_path.exists(): + meta = json.loads(meta_path.read_text(encoding="utf-8")) + source = meta.get("source", "") + parsed_at = meta.get("parsed_at", parsed_at) return { "content_list": content_list, "full_md": md_text, @@ -631,8 +699,8 @@ def _load_cached(self, output_dir: Path) -> Optional[dict]: if content_list else self._md_to_first_page(md_text) ), "references_md": self._extract_references(md_text), - "source": "mineru" if content_list else "pymupdf", - "parsed_at": datetime.now(timezone.utc).isoformat(), + "source": source or ("mineru" if content_list else "pymupdf"), + "parsed_at": parsed_at, } except Exception: return None diff --git a/citationclaw/core/scraper.py b/citationclaw/core/scraper.py index 3fcbd5f..d04f311 100644 --- a/citationclaw/core/scraper.py +++ b/citationclaw/core/scraper.py @@ -283,12 +283,12 @@ def _parse_citation_count(self, html: str) -> int: from bs4 import BeautifulSoup soup = BeautifulSoup(html, 'html.parser') stat_patterns = [ - r'找到约\s*([\d,]+)\s*条', - r'获得\s*([\d,]+)\s*条', - r'约\s*([\d,]+)\s*条', - r'([\d,]+)\s*条结果', - r'About\s+([\d,]+)\s+results?', - r'^([\d,]+)\s+results?\b', + r'找到约\s*([\d,.\s]+?)\s*条', + r'获得\s*([\d,.\s]+?)\s*条', + r'约\s*([\d,.\s]+?)\s*条', + r'([\d,.\s]+?)\s*条结果', + r'About\s+([\d,.\s]+?)\s+results?', + r'^([\d,.\s]+?)\s+results?\b', ] candidates = [] id_elem = soup.find(id='gs_ab_mdw') @@ -301,25 +301,29 @@ def _parse_citation_count(self, html: str) -> int: for pat in stat_patterns: m = re.search(pat, stat_text, re.IGNORECASE) if m: - citation_count = int(m.group(1).replace(',', '')) - self.log_callback(f"🔍 结果统计元素文本: {stat_text[:100]}") - return citation_count + raw = re.sub(r'[,.\s]', '', m.group(1).strip()) + if raw.isdigit(): + citation_count = int(raw) + self.log_callback(f"🔍 结果统计元素文本: {stat_text[:100]}") + return citation_count except Exception: pass # 第二步:对整个 HTML 做正则(数字可能含 标签) patterns = [ - r'找到约\s*(?:<[^>]+>)?\s*([\d,]+)\s*(?:<[^>]+>)?\s*条', - r'获得\s*(?:<[^>]+>)?\s*([\d,]+)\s*(?:<[^>]+>)?\s*条', - r'约\s*(?:<[^>]+>)?\s*([\d,]+)\s*(?:<[^>]+>)?\s*条结果', - r'About\s+(?:<[^>]+>)?\s*([\d,]+)\s+results?', - r'([\d,]+)\s*条结果', - r'>(\d[\d,]*)\s+results?\b', + r'找到约\s*(?:<[^>]+>)?\s*([\d,.\s]+?)\s*(?:<[^>]+>)?\s*条', + r'获得\s*(?:<[^>]+>)?\s*([\d,.\s]+?)\s*(?:<[^>]+>)?\s*条', + r'约\s*(?:<[^>]+>)?\s*([\d,.\s]+?)\s*(?:<[^>]+>)?\s*条结果', + r'About\s+(?:<[^>]+>)?\s*([\d,.\s]+?)\s+results?', + r'([\d,.\s]+?)\s*条结果', + r'>(\d[\d,.\s]*?)\s+results?\b', ] for pattern in patterns: 
             match = re.search(pattern, html, re.IGNORECASE)
             if match:
-                return int(match.group(1).replace(',', ''))
+                raw = re.sub(r'[,.\s]', '', match.group(1).strip())
+                if raw.isdigit():
+                    return int(raw)
 
         return 0
diff --git a/citationclaw/core/structured_author_fetcher.py b/citationclaw/core/structured_author_fetcher.py
new file mode 100644
index 0000000..adf8e9a
--- /dev/null
+++ b/citationclaw/core/structured_author_fetcher.py
@@ -0,0 +1,310 @@
+"""Structured author fetching: WOS → S2 fallback → MinerU affiliation supplement."""
+from __future__ import annotations
+
+import asyncio
+import re
+from pathlib import Path
+from typing import Any
+from urllib.parse import urlencode
+
+import httpx
+
+from citationclaw.core.author_name_utils import format_wos_name, name_keys, names_match, to_natural_name
+from citationclaw.core.s2_client import S2Client
+
+
+_WOS_ENDPOINT = "https://api.clarivate.com/apis/wos-starter/v1/documents"
+
+
+def _normalize_title(text: str) -> str:
+    import unicodedata
+    text = unicodedata.normalize("NFKD", text or "").lower()
+    text = re.sub(r"[^a-z0-9]+", " ", text)
+    return " ".join(text.split())
+
+
+def _wos_hit_authors(hit: dict) -> list[dict[str, Any]]:
+    names = hit.get("names", {}) or {}
+    authors = []
+    for a in names.get("authors", []) or []:
+        raw = (a.get("wosStandard", "") or a.get("displayName", "")).strip()
+        name = to_natural_name(format_wos_name(raw) or raw)
+        if name:
+            authors.append({"name": name, "affiliation": "", "email": "", "source": "wos"})
+    return authors
+
+
+async def _query_wos(
+    api_key: str,
+    title: str,
+    doi: str,
+    *,
+    retries: int = 2,
+    retry_wait: float = 20.0,
+) -> list[dict[str, Any]]:
+    """Query the WOS Starter API; returns an author list (empty on failure or not found)."""
+    if not api_key:
+        return []
+    query = f"DO=({doi})" if doi else f'TI=("{title}")'
+    params = {"db": "WOS", "q": query, "limit": 5, "page": 1}
+    headers = {"X-ApiKey": api_key}
+
+    for attempt in range(retries + 1):
+        try:
+            async with httpx.AsyncClient(timeout=60.0) as client:
+                resp = await client.get(
+                    f"{_WOS_ENDPOINT}?{urlencode(params)}", headers=headers
+                )
+                resp.raise_for_status()
+                hits = resp.json().get("hits", []) or []
+        except httpx.HTTPStatusError as exc:
+            if exc.response.status_code == 429 and attempt < retries:
+                await asyncio.sleep(retry_wait * (attempt + 1))
+                continue
+            return []
+        except Exception:
+            if attempt < retries:
+                await asyncio.sleep(10.0)
+                continue
+            return []
+
+        if not hits:
+            return []
+        # DOI match preferred; fall back to the first title-equal hit, then the first hit
+        title_norm = _normalize_title(title)
+        best = None
+        for h in hits:
+            ids = h.get("identifiers", {}) or {}
+            if doi and (ids.get("doi") or "").lower() == doi.lower():
+                best = h
+                break
+        if best is None:
+            for h in hits:
+                if _normalize_title(h.get("title", "")) == title_norm:
+                    best = h
+                    break
+        if best is None:
+            best = hits[0]
+        return _wos_hit_authors(best)
+    return []
+
+
+async def _query_s2(s2_client: S2Client, title: str) -> list[dict[str, Any]]:
+    result = await s2_client.search_paper(title)
+    if not result:
+        return []
+    authors = []
+    for a in result.get("authors", []):
+        raw = a.get("name", "")
+        name = to_natural_name(format_wos_name(raw) or raw)
+        if name:
+            authors.append({
+                "name": name,
+                "affiliation": a.get("affiliation", ""),
+                "email": "",
+                "s2_id": a.get("s2_id", ""),
+                "source": "s2",
+            })
+    return authors
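For reference, a sketch of the only fields `_wos_hit_authors` reads from a WOS Starter document record — the sample below is illustrative, not a captured API response:

```python
from citationclaw.core.structured_author_fetcher import _wos_hit_authors

# Illustrative record: `names.authors[].wosStandard` (preferred) or
# `displayName` is what the helper above consumes.
hit = {
    "title": "Deep Residual Learning for Image Recognition",
    "identifiers": {"doi": "10.1109/cvpr.2016.90"},
    "names": {
        "authors": [
            {"wosStandard": "He, KM", "displayName": "He, Kaiming"},
            {"wosStandard": "Zhang, XY", "displayName": "Zhang, Xiangyu"},
        ]
    },
}

print(_wos_hit_authors(hit))
# [{'name': 'KM He', 'affiliation': '', 'email': '', 'source': 'wos'},
#  {'name': 'XY Zhang', 'affiliation': '', 'email': '', 'source': 'wos'}]
```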
+
+
+def _merge_wos_s2(
+    wos_authors: list[dict[str, Any]],
+    s2_authors: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """Enrich the WOS author list with S2 affiliation/s2_id via name matching.
+
+    WOS is authoritative for the author list; S2 fills in missing affiliations,
+    and S2 authors that match no WOS author are dropped. Names are matched with
+    names_match(), which handles initials, accents, and different formatting
+    conventions (WOS abbreviated ↔ S2 full names).
+    """
+    if not s2_authors:
+        return wos_authors
+    if not wos_authors:
+        return s2_authors
+
+    merged: list[dict[str, Any]] = []
+
+    for wos_a in wos_authors:
+        enriched = dict(wos_a)
+        for s2_a in s2_authors:
+            if names_match(wos_a.get("name", ""), s2_a.get("name", "")):
+                if s2_a.get("affiliation") and not enriched.get("affiliation"):
+                    enriched["affiliation"] = s2_a["affiliation"]
+                    enriched["affiliation_source"] = "s2"
+                if s2_a.get("s2_id") and not enriched.get("s2_id"):
+                    enriched["s2_id"] = s2_a["s2_id"]
+                if s2_a.get("openalex_id") and not enriched.get("openalex_id"):
+                    enriched["openalex_id"] = s2_a["openalex_id"]
+                break
+        merged.append(enriched)
+
+    return [a for a in merged if a.get("name")]
+
+
+def _merge_with_pdf(
+    api_authors: list[dict[str, Any]],
+    pdf_authors: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """Enrich api_authors with affiliations from pdf_authors; append unmatched PDF authors."""
+    if not pdf_authors:
+        return api_authors
+    if not api_authors:
+        return [{"name": to_natural_name(a.get("name", "")), "affiliation": a.get("affiliation", ""),
+                 "email": a.get("email", ""), "source": "pdf"} for a in pdf_authors]
+
+    pdf_by_key: dict[str, dict] = {}
+    for a in pdf_authors:
+        for k in name_keys(a.get("name", "")):
+            pdf_by_key.setdefault(k, a)
+
+    matched_ids: set[int] = set()
+    merged = []
+    for api_a in api_authors:
+        enriched = dict(api_a)
+        match = None
+        for k in name_keys(api_a.get("name", "")):
+            if k in pdf_by_key:
+                match = pdf_by_key[k]
+                matched_ids.add(id(match))
+                break
+        if match:
+            if match.get("affiliation") and not enriched.get("affiliation"):
+                enriched["affiliation"] = match["affiliation"]
+            if match.get("email") and not enriched.get("email"):
+                enriched["email"] = match["email"]
+        merged.append(enriched)
+
+    for pdf_a in pdf_authors:
+        if id(pdf_a) not in matched_ids:
+            merged.append({
+                "name": to_natural_name(pdf_a.get("name", "")),
+                "affiliation": pdf_a.get("affiliation", ""),
+                "email": pdf_a.get("email", ""),
+                "source": "pdf_only",
+            })
+    return [a for a in merged if a.get("name")]
+
+
+class StructuredAuthorFetcher:
+    """Fetch a structured author list: WOS → S2 fallback → MinerU affiliation supplement.
+
+    All returned names are in natural format ("First Last", no comma).
+    """
+
+    def __init__(
+        self,
+        wos_api_key: str = "",
+        s2_api_key: str = "",
+        mineru_api_token: str = "",
+        openai_api_key: str = "",
+        openai_base_url: str = "",
+        model: str = "",
+        pdf_cache_dir: str | Path | None = None,
+        log_callback=None,
+    ):
+        self._wos_key = wos_api_key
+        self._s2_client = S2Client(api_key=s2_api_key)
+        self._mineru_token = mineru_api_token
+        self._openai_key = openai_api_key
+        self._openai_base = openai_base_url
+        self._model = model
+        self._pdf_cache_dir = Path(pdf_cache_dir) if pdf_cache_dir else None
+        self._log = log_callback or (lambda msg: None)
+
+    async def fetch(
+        self,
+        title: str,
+        doi: str = "",
+        pdf_path: str | Path | None = None,
+    ) -> tuple[list[dict[str, Any]], str]:
+        """Return (author_list, source_label).
+
+        source_label: "wos" | "wos+s2" | "wos+pdf" | "s2" | "s2+pdf" | "pdf" | ""
+        (empty = no authors found).
+        """
+        # Step 1: WOS
+        wos_authors: list[dict] = []
+        if self._wos_key:
+            try:
+                wos_authors = await _query_wos(self._wos_key, title, doi)
+            except Exception as exc:
+                self._log(f"[StructuredAuthorFetcher] WOS error: {exc}")
+
+        # Step 2: PDF (MinerU) — try to get affiliations from the PDF
+        pdf_authors: list[dict] = []
+        pdf_path_resolved = Path(pdf_path) if pdf_path else None
+        if pdf_path_resolved and pdf_path_resolved.exists() and self._openai_key:
+            try:
+                pdf_authors = await self._run_mineru(pdf_path_resolved, title, doi)
+            except Exception as exc:
+                self._log(f"[StructuredAuthorFetcher] MinerU error: {exc}")
+
+        if wos_authors:
+            if pdf_authors:
+                # WOS + PDF: enrich WOS names with PDF affiliations
+                merged = _merge_with_pdf(wos_authors, pdf_authors)
+                self._log(
+                    f"[StructuredAuthorFetcher] WOS {len(wos_authors)} + PDF {len(pdf_authors)}"
+                    f" → merged {len(merged)} authors"
+                )
+                return merged, "wos+pdf"
+            else:
+                # WOS succeeded but no PDF — fall back to S2 for affiliations
+                s2_authors: list[dict] = []
+                if self._s2_client:
+                    try:
+                        s2_authors = await _query_s2(self._s2_client, title)
+                    except Exception as exc:
+                        self._log(f"[StructuredAuthorFetcher] S2 error: {exc}")
+                if s2_authors:
+                    merged = _merge_wos_s2(wos_authors, s2_authors)
+                    self._log(
+                        f"[StructuredAuthorFetcher] WOS {len(wos_authors)} + S2 {len(s2_authors)}"
+                        f" → merged {len(merged)} authors"
+                    )
+                    return merged, "wos+s2"
+                else:
+                    return wos_authors, "wos"
+
+        # WOS failed — fall back to S2
+        s2_authors: list[dict] = []
+        if self._s2_client:
+            try:
+                s2_authors = await _query_s2(self._s2_client, title)
+            except Exception as exc:
+                self._log(f"[StructuredAuthorFetcher] S2 error: {exc}")
+
+        if s2_authors:
+            if pdf_authors:
+                merged = _merge_with_pdf(s2_authors, pdf_authors)
+                return merged, "s2+pdf"
+            return s2_authors, "s2"
+
+        if pdf_authors:
+            return [{"name": to_natural_name(a.get("name", "")), "affiliation": a.get("affiliation", ""),
+                     "email": a.get("email", ""), "source": "pdf"} for a in pdf_authors], "pdf"
+
+        return [], ""
+
+    async def _run_mineru(self, pdf_path: Path, title: str, doi: str) -> list[dict]:
+        from citationclaw.core.pdf_mineru_parser import MinerUParser
+        from citationclaw.core.pdf_author_extractor import PDFAuthorExtractor
+
+        cache_dir = self._pdf_cache_dir or (pdf_path.parent / ".pdf_parsed_cache")
+        parser = MinerUParser(output_base=cache_dir, mineru_api_token=self._mineru_token)
+        extractor = PDFAuthorExtractor(
+            api_key=self._openai_key,
+            base_url=self._openai_base,
+            model=self._model,
+        )
+        paper_key = parser.paper_key({"doi": doi, "title": title or pdf_path.stem})
+        parsed = await parser.parse_async(pdf_path, paper_key)
+        if not parsed:
+            return []
+        blocks = parsed.get("content_list") or parsed.get("first_page_blocks", [])
+        return await extractor.extract(blocks)
diff --git a/citationclaw/skills/phase2_author_intel.py b/citationclaw/skills/phase2_author_intel.py
index d1dfdcf..7bc41df 100644
--- a/citationclaw/skills/phase2_author_intel.py
+++ b/citationclaw/skills/phase2_author_intel.py
@@ -45,6 +45,9 @@ async def _run_inner(self, ctx: SkillContext, **kwargs) -> SkillResult:
             target_paper_authors=target_paper_authors,
             author_cache=author_cache,
             cancel_event=quota_event,
+            wos_api_key=getattr(config, 'wos_api_key', ''),
+            s2_api_key=getattr(config, 's2_api_key', ''),
+            mineru_api_token=getattr(config, 'mineru_api_token', ''),
         )
 
         await searcher.search(
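End-to-end, the new fetcher can be exercised standalone — a minimal sketch (the keys, title, and DOI are placeholders; `log_callback=print` just surfaces the merge logs):

```python
import asyncio

from citationclaw.core.structured_author_fetcher import StructuredAuthorFetcher


async def main() -> None:
    fetcher = StructuredAuthorFetcher(
        wos_api_key="YOUR-WOS-STARTER-KEY",  # placeholder
        s2_api_key="YOUR-S2-KEY",            # optional; enriches affiliations
        log_callback=print,
    )
    authors, source = await fetcher.fetch(
        "Deep Residual Learning for Image Recognition",
        doi="10.1109/cvpr.2016.90",
    )
    print(source)  # "wos", "wos+s2", "s2", ... per the docstring above
    for a in authors:
        print(f'{a["name"]} | {a.get("affiliation", "")}')


asyncio.run(main())
```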
diff --git a/citationclaw/static/js/main.js b/citationclaw/static/js/main.js
index 3aed5ba..9a6f9ab 100644
--- a/citationclaw/static/js/main.js
+++ b/citationclaw/static/js/main.js
@@ -663,6 +663,7 @@ function initIndexPage() {
     if (el('idx-service-tier')) el('idx-service-tier').value = cfg.service_tier || 'basic';
     if (el('idx-dashboard-model')) el('idx-dashboard-model').value = cfg.dashboard_model || 'gemini-3-flash-preview-nothinking';
     if (el('idx-s2-api-key')) el('idx-s2-api-key').value = cfg.s2_api_key || '';
+    if (el('idx-wos-api-key')) el('idx-wos-api-key').value = cfg.wos_api_key || '';
     if (el('idx-mineru-token')) el('idx-mineru-token').value = cfg.mineru_api_token || '';
     if (el('idx-api-access-token')) el('idx-api-access-token').value = cfg.api_access_token || '';
     if (el('idx-api-user-id')) el('idx-api-user-id').value = cfg.api_user_id || '';
@@ -709,6 +710,7 @@ function initIndexPage() {
      }[el('idx-service-tier')?.value || 'basic']),
      dashboard_model: el('idx-dashboard-model')?.value || '',
      s2_api_key: el('idx-s2-api-key')?.value || '',
+     wos_api_key: el('idx-wos-api-key')?.value || '',
      mineru_api_token: el('idx-mineru-token')?.value || '',
      api_access_token: el('idx-api-access-token')?.value || '',
      api_user_id: el('idx-api-user-id')?.value || '',
@@ -721,6 +723,7 @@ function initIndexPage() {
     if (!body.api_access_token && existing.api_access_token) delete body.api_access_token;
     if (!body.api_user_id && existing.api_user_id) delete body.api_user_id;
     if (!body.s2_api_key && existing.s2_api_key) delete body.s2_api_key;
+    if (!body.wos_api_key && existing.wos_api_key) delete body.wos_api_key;
     if (!body.mineru_api_token && existing.mineru_api_token) delete body.mineru_api_token;
     const merged = Object.assign({}, existing, body);
     const resp = await safeFetch('/api/config', {
diff --git a/citationclaw/templates/index.html b/citationclaw/templates/index.html
index 55ff28c..a1ddd18 100644
--- a/citationclaw/templates/index.html
+++ b/citationclaw/templates/index.html
@@ -244,6 +244,10 @@

 论文被引画像分析
 
+        <div class="form-group">
+          <label>Web of Science Starter API Key(可选,用于结构化作者提取)</label>
+          <input type="password" id="idx-wos-api-key" placeholder="WOS Starter API Key">
+        </div>
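Finally, a key entered in the new field can be sanity-checked outside the app — this sketch reuses the exact endpoint, header, and `TI=()` query syntax from `_query_wos`; reading the key from a `WOS_API_KEY` environment variable is an assumption of the example:

```python
import asyncio
import os

import httpx

WOS_ENDPOINT = "https://api.clarivate.com/apis/wos-starter/v1/documents"


async def check_wos_key() -> None:
    params = {
        "db": "WOS",
        "q": 'TI=("Deep Residual Learning for Image Recognition")',  # illustrative title
        "limit": 5,
        "page": 1,
    }
    headers = {"X-ApiKey": os.environ["WOS_API_KEY"]}
    async with httpx.AsyncClient(timeout=60.0) as client:
        resp = await client.get(WOS_ENDPOINT, params=params, headers=headers)
        resp.raise_for_status()  # 401/403 → bad key; 429 → rate limited
        print(f'{len(resp.json().get("hits", []))} hit(s) returned — key works')


asyncio.run(check_wos_key())
```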