Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions verify-references/SKILL.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
---
name: verify-references
description: Verify and validate bibliography references in BibTeX files by comparing with CrossRef metadata. Use when user asks to check, verify, or validate references.
---

When this skill is invoked, you must:

## Step 0: Pre-check

1. Ask user for:
- `bib_file` path (e.g., `Biblio/ref.bib`)
- `tex_dir` path (e.g., `Tex/`)

2. Generate report file path: `<bib_file_dir>/verification_report.md`

3. Check if report already exists:
- If exists: Use AskUserQuestion tool to ask "Found existing verification report at `<report_path>`. What would you like to do?" with options:
- "Re-run verification" (description: "Delete existing report and run full verification again")
- "View existing report" (description: "Read and summarize the existing report without re-running")
- If user chooses "View existing report": Read and summarize the existing report, then exit
- If user chooses "Re-run verification" or report doesn't exist: Continue to Step 1

## Step 1: Check Unused References

1. Run: `python ~/.claude/skills/verify-references/check_unused_refs.py <bib_file> <tex_dir>`

2. Parse JSON output and write to report file:
- Create report with header "# Bibliography Verification Report"
- Add section "## Unused References"
- List total references, cited count, uncited count
- List uncited citation keys

## Step 2: Download CrossRef Metadata

1. Generate output file path: `<bib_file_dir>/ref_crossref.bib`

2. Tell user: "Downloading CrossRef metadata. You can monitor progress in real-time by viewing `<full_path_to_ref_crossref.bib>`"

3. Run: `python -u ~/.claude/skills/verify-references/download_crossref.py <bib_file> <output_file>`
- Queries CrossRef API for all entries with DOIs
- Generates a CrossRef version of the bib file for comparison
- Maintains the same entry order as the original file
- **Monitor and relay progress**: As the script outputs progress lines like "[1/50] Querying key: DOI", relay these to the user in real-time so they can see the download progress

4. Report progress summary:
- Total entries processed
- Entries with DOIs
- Entries without DOIs (skipped)

## Step 3: Generate Comparison Report

1. Run: `python ~/.claude/skills/verify-references/compare_refs.py <bib_file> <crossref_bib> <report_file>`
- Use the same `<bib_file_dir>/verification_report.md` from Step 1
- Compares original bib file with CrossRef version
- Identifies discrepancies in metadata fields
- Calculates similarity scores
- Appends comparison results to the existing report

2. Parse and present the comparison section:
- Summary statistics
- Entries with no discrepancies
- Entries with discrepancies (sorted by similarity)
- Detailed field-by-field comparison table

## Step 4: Final Summary

1. Read the complete verification report from `<bib_file_dir>/verification_report.md`

2. Provide a concise summary to the user:
- Number of unused references
- Number of entries verified against CrossRef
- Number of entries with discrepancies
- Key issues found (if any)

3. Tell the user: "Detailed report saved to: `<full_path_to_verification_report.md>`"

79 changes: 79 additions & 0 deletions verify-references/check_unused_refs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
检查 .bib 文件中未被引用的参考文献
"""

import os
import re
import sys
import json
from collections import defaultdict

def extract_bib_keys(bib_file):
    """Extract all citation keys declared in a .bib file, in file order.

    Tolerates whitespace between the entry type and the opening brace
    (e.g. ``@Book {key,``) and skips ``@string`` / ``@preamble`` /
    ``@comment`` blocks, which define macros rather than citable entries.
    """
    with open(bib_file, 'r', encoding='utf-8') as f:
        content = f.read()
    keys = []
    for entry_type, key in re.findall(r'@(\w+)\s*\{\s*([^,\s]+?)\s*,', content):
        # Macro/preamble blocks are not references and must not be
        # reported as "uncited".
        if entry_type.lower() not in ('string', 'preamble', 'comment'):
            keys.append(key)
    return keys

def count_citations_in_file(file_path, bib_keys):
    """Count how often each key in *bib_keys* is cited in one .tex file.

    Recognizes the whole \\cite family (\\cite, \\citep, \\citet,
    \\citeauthor, starred variants, ...) including optional ``[...]``
    arguments such as ``\\cite[p.~5]{key}``, and ignores commented-out
    lines. Returns a defaultdict mapping key -> count (cited keys only).
    """
    citation_counts = defaultdict(int)
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Strip LaTeX comments; the lookbehind keeps escaped percents (\%).
    content = re.sub(r'(?<!\\)%.*', '', content)

    # Build the set once so membership tests below are O(1).
    known_keys = set(bib_keys)

    # \cite-family command, optional star, then any number of optional
    # [...] arguments before the mandatory {key,...} group.
    matches = re.findall(r'\\[cC]ite[a-zA-Z]*\*?\s*(?:\[[^\]]*\]\s*)*\{([^}]+)\}', content)
    for match in matches:
        for key in match.split(','):
            key = key.strip()
            if key in known_keys:
                citation_counts[key] += 1

    return citation_counts

def main():
    """CLI entry point: report uncited bibliography keys as JSON on stdout.

    Usage: check_unused_refs.py <bib_file> <tex_dir>
    Exits with status 1 (and a JSON error object) on bad arguments.
    """
    if len(sys.argv) != 3:
        print(json.dumps({"error": "Usage: check_unused_refs.py <bib_file> <tex_dir>"}))
        sys.exit(1)

    bib_file, tex_dir = sys.argv[1], sys.argv[2]

    # Validate both paths up front, in the same order as the arguments.
    for path, label in ((bib_file, "BIB file"), (tex_dir, "TEX directory")):
        if not os.path.exists(path):
            print(json.dumps({"error": f"{label} not found: {path}"}))
            sys.exit(1)

    bib_keys = extract_bib_keys(bib_file)

    # Aggregate citation counts across every .tex file in the directory.
    total_citations = defaultdict(int)
    for filename in os.listdir(tex_dir):
        if not filename.endswith('.tex'):
            continue
        per_file = count_citations_in_file(os.path.join(tex_dir, filename), bib_keys)
        for key, count in per_file.items():
            total_citations[key] += count

    # A key is "uncited" when it never appeared in any \cite command.
    uncited = [key for key in bib_keys if key not in total_citations]

    print(json.dumps({
        "total_refs": len(bib_keys),
        "cited_refs": len(total_citations),
        "uncited_refs": len(uncited),
        "uncited_keys": uncited,
    }, ensure_ascii=False, indent=2))

if __name__ == "__main__":
    main()
209 changes: 209 additions & 0 deletions verify-references/compare_refs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Compare bibliographic information between ref.bib and ref_.bib
"""

import sys
import os
import re
from difflib import SequenceMatcher

def parse_bib_entries(filepath):
    """Parse a BibTeX file into ``{citation_key: {field: value, 'line': int}}``.

    Handles both quoted (``field = "..."``) and braced (``field = {...}``)
    values, including nested braces. Only a fixed set of bibliographic
    fields is extracted; anything else in the entry body is ignored.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    entries = {}
    # One entry runs from '@type{key,' up to the next '@' at the start of
    # a line, or end of file.
    entry_pattern = r'@(\w+)\{([^,]+),([^@]*?)(?=\n@|\Z)'

    for entry_match in re.finditer(entry_pattern, content, re.DOTALL):
        key = entry_match.group(2).strip()
        body = entry_match.group(3)

        # 1-based line number of the entry, used for report deep-links.
        line_num = content[:entry_match.start()].count('\n') + 1

        entry = {'key': key, 'line': line_num}

        for field in ['title', 'author', 'journal', 'volume', 'number', 'pages', 'year', 'doi', 'url']:
            # \b prevents e.g. 'booktitle' from being captured as 'title'.
            # First try the quoted form: field = "..."
            quoted = re.search(rf'\b{field}\s*=\s*"([^"]*)"', body, re.IGNORECASE | re.DOTALL)
            if quoted:
                value = quoted.group(1).strip()
                # Remove one layer of outer braces if present.
                if value.startswith('{') and value.endswith('}'):
                    value = value[1:-1].strip()
                entry[field] = value
                continue

            # Then the braced form: field = {...} with balanced nesting.
            opener = re.search(rf'\b{field}\s*=\s*\{{', body, re.IGNORECASE)
            if opener:
                start = opener.end()
                depth = 1
                i = start
                while i < len(body) and depth > 0:
                    if body[i] == '{':
                        depth += 1
                    elif body[i] == '}':
                        depth -= 1
                    i += 1
                value = body[start:i - 1].strip()
                # Remove one layer of outer braces if present.
                if value.startswith('{') and value.endswith('}'):
                    value = value[1:-1].strip()
                entry[field] = value

        entries[key] = entry

    return entries

def normalize(text):
    """Lowercase, collapse whitespace, and strip LaTeX markup for matching."""
    if not text:
        return ""
    collapsed = re.sub(r'\s+', ' ', text.lower())
    # Unwrap single-argument LaTeX commands (\emph{x} -> x), then drop
    # any remaining stray braces.
    unwrapped = re.sub(r'\\[a-z]+\{([^}]*)\}', r'\1', collapsed)
    return re.sub(r'[{}]', '', unwrapped).strip()

def similarity(a, b):
    """Return a ratio in [0, 1] between the normalized forms of a and b."""
    left, right = normalize(a), normalize(b)
    return SequenceMatcher(None, left, right).ratio()

def compare_entries(orig, crossref):
    """Compare an original entry against its CrossRef counterpart.

    Returns ``(issues, details, min_sim)``: human-readable issue lines,
    per-field diff data, and the lowest similarity score observed.
    Fields matching with similarity >= 0.9 are treated as equal.
    """
    issues = []
    details = {}
    min_sim = 1.0

    for field in ['title', 'author', 'journal', 'volume', 'number', 'pages', 'year']:
        orig_val = orig.get(field, '')
        cross_val = crossref.get(field, '')

        # Absent on both sides: nothing to compare.
        if not orig_val and not cross_val:
            continue

        if not orig_val:
            # Issue numbers are frequently omitted; don't flag them.
            if field == 'number':
                continue
            issues.append(f"  {field}: MISSING in original")
            details[field] = {'orig': '', 'cross': cross_val, 'issue': 'MISSING', 'sim': 0.0}
            min_sim = 0.0
            continue

        # CrossRef lacks the field: cannot compare, skip silently.
        if not cross_val:
            continue

        score = similarity(orig_val, cross_val)
        if score >= 0.9:
            continue  # close enough -- treat as matching

        issues.extend([
            f"  {field}:",
            f"    Original: {orig_val}",
            f"    CrossRef: {cross_val}",
            f"    Similarity: {score:.2%}",
        ])
        details[field] = {'orig': orig_val, 'cross': cross_val, 'sim': score}
        min_sim = min(min_sim, score)

    return issues, details, min_sim

def main():
    """CLI entry point: compare two bib files and append a markdown report.

    Usage: compare_refs.py <orig_bib> <crossref_bib> <output_report>
    Prints progress to stdout and appends a "CrossRef Metadata Comparison"
    section to the report file.
    """
    # Bug fix: json was used below but never imported at module level.
    import json

    if len(sys.argv) != 4:
        print(json.dumps({"error": "Usage: compare_refs.py <orig_bib> <crossref_bib> <output_report>"}))
        sys.exit(1)

    orig_file = sys.argv[1]
    cross_file = sys.argv[2]
    report_file = sys.argv[3]

    if not os.path.exists(orig_file):
        print(json.dumps({"error": f"Original file not found: {orig_file}"}))
        sys.exit(1)

    if not os.path.exists(cross_file):
        print(json.dumps({"error": f"CrossRef file not found: {cross_file}"}))
        sys.exit(1)

    print("Parsing files...")
    orig_entries = parse_bib_entries(orig_file)
    cross_entries = parse_bib_entries(cross_file)

    print(f"Found {len(orig_entries)} entries in ref.bib")
    print(f"Found {len(cross_entries)} entries in ref_.bib\n")

    print("=" * 80)
    print("COMPARISON RESULTS")
    print("=" * 80)

    report_data = []
    entries_with_doi = 0
    entries_without_doi = 0

    for key in sorted(orig_entries.keys()):
        if key not in cross_entries:
            # No CrossRef counterpart (entry had no DOI); skip comparison.
            entries_without_doi += 1
            continue
        entries_with_doi += 1
        issues, details, min_sim = compare_entries(orig_entries[key], cross_entries[key])
        if issues:
            print(f"\n[{key}]")
            for issue in issues:
                print(issue)
            doi = orig_entries[key].get('doi', cross_entries[key].get('doi', ''))
            url = orig_entries[key].get('url', cross_entries[key].get('url', ''))
            report_data.append({'key': key, 'doi': doi, 'url': url, 'details': details, 'min_sim': min_sim})

    entries_with_issues = len(report_data)
    entries_no_issues = entries_with_doi - entries_with_issues

    print("\n" + "=" * 80)
    print(f"Total entries with discrepancies: {entries_with_issues}")
    print("=" * 80)

    # Worst matches first so reviewers see likely errors at the top.
    report_data.sort(key=lambda x: x['min_sim'])

    # Bug fix: guard against ZeroDivisionError when no entry had
    # CrossRef data (entries_with_doi == 0).
    pct_ok = entries_no_issues / entries_with_doi * 100 if entries_with_doi else 0.0
    pct_bad = entries_with_issues / entries_with_doi * 100 if entries_with_doi else 0.0

    # Append (not overwrite): the report already contains the
    # unused-references section written by an earlier step.
    with open(report_file, 'a', encoding='utf-8') as f:
        f.write("\n\n## CrossRef Metadata Comparison\n\n")
        f.write("### Summary\n\n")
        f.write(f"- **Total entries in ref.bib**: {len(orig_entries)}\n")
        f.write(f"- **Entries with DOI (in ref_.bib)**: {entries_with_doi}\n")
        f.write(f"- **Entries without DOI (skipped)**: {entries_without_doi}\n")
        f.write(f"- **Entries with no discrepancies**: {entries_no_issues} ({pct_ok:.1f}%)\n")
        f.write(f"- **Entries with discrepancies**: {entries_with_issues} ({pct_bad:.1f}%)\n\n")
        f.write("### Detailed Comparison\n\n")
        f.write("Entries are sorted by similarity (lowest first).\n\n")
        f.write("| # | Citation Key | Field | Original | CrossRef | Similarity | URL |\n")
        f.write("|---|--------------|-------|----------|----------|------------|-----|\n")

        idx = 1
        for item in report_data:
            key = item['key']
            line = orig_entries[key].get('line', 0)
            doi = item['doi']
            url = item['url']
            link = f"https://doi.org/{doi}" if doi else url
            key_md = f"[{key}](ref.bib#L{line})"
            url_md = f"[url]({link})" if link else ""

            for field, info in item['details'].items():
                # Escape pipes and newlines so the markdown table stays valid;
                # truncate long values to keep rows readable.
                orig = info['orig'].replace('|', '\\|').replace('\n', ' ')[:60]
                cross = info['cross'].replace('|', '\\|').replace('\n', ' ')[:60]
                sim = info.get('sim', 0.0)
                sim_str = "MISSING" if info.get('issue') == 'MISSING' else f"{sim:.0%}"
                f.write(f"| {idx} | {key_md} | {field} | {orig} | {cross} | {sim_str} | {url_md} |\n")
            idx += 1

    print(f"\nMarkdown report saved to: {report_file}")

if __name__ == "__main__":
    main()
Loading