Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions verify-references/SKILL.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
---
name: verify-references
description: Verify and validate bibliography references in BibTeX files by comparing with CrossRef metadata. Use when user asks to check, verify, or validate references.
---

When this skill is invoked, you must:

## Step 0: Pre-check

1. Ask user for:
- `bib_file` path (e.g., `Biblio/ref.bib`)
- `tex_dir` path (e.g., `Tex/`)

2. Generate report file path: `<bib_file_dir>/verification_report.md`

3. Check if report already exists:
- If exists: Use AskUserQuestion tool to ask "Found existing verification report at `<report_path>`. What would you like to do?" with options:
- "Re-run verification" (description: "Delete existing report and run full verification again")
- "View existing report" (description: "Read and summarize the existing report without re-running")
- If user chooses "View existing report": Read and summarize the existing report, then exit
- If user chooses "Re-run verification" or report doesn't exist: Continue to Step 1

## Step 1: Check Unused References

1. Run: `python ~/.claude/skills/verify-references/check_unused_refs.py <bib_file> <tex_dir>`

2. Parse JSON output and write to report file:
- Create report with header "# Bibliography Verification Report"
- Add section "## Unused References"
- List total references, cited count, uncited count
- List uncited citation keys

## Step 2: Download CrossRef Metadata

1. Generate output file path: `<bib_file_dir>/ref_crossref.bib`

2. Tell user: "Downloading CrossRef metadata. You can monitor progress in real-time by viewing `<full_path_to_ref_crossref.bib>`"

3. Run: `python -u ~/.claude/skills/verify-references/download_crossref.py <bib_file> <output_file>`
- Queries CrossRef API for all entries with DOIs
- Generates a CrossRef version of the bib file for comparison
- Maintains the same entry order as the original file
- **Monitor and relay progress**: As the script outputs progress lines like "[1/50] Querying key: DOI", relay these to the user in real-time so they can see the download progress

4. Report progress summary:
- Total entries processed
- Entries with DOIs
- Entries without DOIs (skipped)

## Step 3: Generate Comparison Report

1. Run: `python ~/.claude/skills/verify-references/compare_refs.py <bib_file> <crossref_bib> <report_file>`
- Use the same `<bib_file_dir>/verification_report.md` from Step 1
- Compares original bib file with CrossRef version
- Identifies discrepancies in metadata fields
- Calculates similarity scores
- Appends comparison results to the existing report

2. Parse and present the comparison section:
- Summary statistics
- Entries with no discrepancies
- Entries with discrepancies (sorted by similarity)
- Detailed field-by-field comparison table

## Step 4: Final Summary

1. Read the complete verification report from `<bib_file_dir>/verification_report.md`

2. Provide a concise summary to the user:
- Number of unused references
- Number of entries verified against CrossRef
- Number of entries with discrepancies
- Key issues found (if any)

3. Tell the user: "Detailed report saved to: `<full_path_to_verification_report.md>`"

79 changes: 79 additions & 0 deletions verify-references/check_unused_refs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
检查 .bib 文件中未被引用的参考文献
"""

import os
import re
import sys
import json
from collections import defaultdict

def extract_bib_keys(bib_file):
    """Extract all citation keys declared in a .bib file, in file order.

    Tolerates whitespace between the entry type and the opening brace
    (e.g. ``@Book {key,``) and skips ``@string`` / ``@preamble`` /
    ``@comment`` blocks, which define macros rather than citable entries.
    """
    with open(bib_file, 'r', encoding='utf-8') as f:
        content = f.read()
    keys = []
    for entry_type, key in re.findall(r'@(\w+)\s*\{\s*([^,\s]+?)\s*,', content):
        # Macro/preamble blocks are not references and must not be
        # reported as "uncited".
        if entry_type.lower() not in ('string', 'preamble', 'comment'):
            keys.append(key)
    return keys

def count_citations_in_file(file_path, bib_keys):
    """Count how often each key in *bib_keys* is cited in one .tex file.

    Recognizes the whole \\cite family (\\cite, \\citep, \\citet,
    \\citeauthor, starred variants, ...) including optional ``[...]``
    arguments such as ``\\cite[p.~5]{key}``, and ignores commented-out
    lines. Returns a defaultdict mapping key -> count (cited keys only).
    """
    citation_counts = defaultdict(int)
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Strip LaTeX comments; the lookbehind keeps escaped percents (\%).
    content = re.sub(r'(?<!\\)%.*', '', content)

    # Build the set once so membership tests below are O(1).
    known_keys = set(bib_keys)

    # \cite-family command, optional star, then any number of optional
    # [...] arguments before the mandatory {key,...} group.
    matches = re.findall(r'\\[cC]ite[a-zA-Z]*\*?\s*(?:\[[^\]]*\]\s*)*\{([^}]+)\}', content)
    for match in matches:
        for key in match.split(','):
            key = key.strip()
            if key in known_keys:
                citation_counts[key] += 1

    return citation_counts

def main():
    """CLI entry point: report uncited bibliography keys as JSON on stdout.

    Usage: check_unused_refs.py <bib_file> <tex_dir>
    Exits with status 1 (and a JSON error object) on bad arguments.
    """
    if len(sys.argv) != 3:
        print(json.dumps({"error": "Usage: check_unused_refs.py <bib_file> <tex_dir>"}))
        sys.exit(1)

    bib_file, tex_dir = sys.argv[1], sys.argv[2]

    # Validate both paths up front, in the same order as the arguments.
    for path, label in ((bib_file, "BIB file"), (tex_dir, "TEX directory")):
        if not os.path.exists(path):
            print(json.dumps({"error": f"{label} not found: {path}"}))
            sys.exit(1)

    bib_keys = extract_bib_keys(bib_file)

    # Aggregate citation counts across every .tex file in the directory.
    total_citations = defaultdict(int)
    for filename in os.listdir(tex_dir):
        if not filename.endswith('.tex'):
            continue
        per_file = count_citations_in_file(os.path.join(tex_dir, filename), bib_keys)
        for key, count in per_file.items():
            total_citations[key] += count

    # A key is "uncited" when it never appeared in any \cite command.
    uncited = [key for key in bib_keys if key not in total_citations]

    print(json.dumps({
        "total_refs": len(bib_keys),
        "cited_refs": len(total_citations),
        "uncited_refs": len(uncited),
        "uncited_keys": uncited,
    }, ensure_ascii=False, indent=2))

if __name__ == "__main__":
    main()
209 changes: 209 additions & 0 deletions verify-references/compare_refs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Compare bibliographic information between ref.bib and ref_.bib
"""

import sys
import os
import re
from difflib import SequenceMatcher

def parse_bib_entries(filepath):
    """Parse a BibTeX file into ``{citation_key: {field: value, 'line': int}}``.

    Handles both quoted (``field = "..."``) and braced (``field = {...}``)
    values, including nested braces. Only a fixed set of bibliographic
    fields is extracted; anything else in the entry body is ignored.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    entries = {}
    # One entry runs from '@type{key,' up to the next '@' at the start of
    # a line, or end of file.
    entry_pattern = r'@(\w+)\{([^,]+),([^@]*?)(?=\n@|\Z)'

    for entry_match in re.finditer(entry_pattern, content, re.DOTALL):
        key = entry_match.group(2).strip()
        body = entry_match.group(3)

        # 1-based line number of the entry, used for report deep-links.
        line_num = content[:entry_match.start()].count('\n') + 1

        entry = {'key': key, 'line': line_num}

        for field in ['title', 'author', 'journal', 'volume', 'number', 'pages', 'year', 'doi', 'url']:
            # \b prevents e.g. 'booktitle' from being captured as 'title'.
            # First try the quoted form: field = "..."
            quoted = re.search(rf'\b{field}\s*=\s*"([^"]*)"', body, re.IGNORECASE | re.DOTALL)
            if quoted:
                value = quoted.group(1).strip()
                # Remove one layer of outer braces if present.
                if value.startswith('{') and value.endswith('}'):
                    value = value[1:-1].strip()
                entry[field] = value
                continue

            # Then the braced form: field = {...} with balanced nesting.
            opener = re.search(rf'\b{field}\s*=\s*\{{', body, re.IGNORECASE)
            if opener:
                start = opener.end()
                depth = 1
                i = start
                while i < len(body) and depth > 0:
                    if body[i] == '{':
                        depth += 1
                    elif body[i] == '}':
                        depth -= 1
                    i += 1
                value = body[start:i - 1].strip()
                # Remove one layer of outer braces if present.
                if value.startswith('{') and value.endswith('}'):
                    value = value[1:-1].strip()
                entry[field] = value

        entries[key] = entry

    return entries

def normalize(text):
    """Lowercase, collapse whitespace, and strip LaTeX markup for matching."""
    if not text:
        return ""
    collapsed = re.sub(r'\s+', ' ', text.lower())
    # Unwrap single-argument LaTeX commands (\emph{x} -> x), then drop
    # any remaining stray braces.
    unwrapped = re.sub(r'\\[a-z]+\{([^}]*)\}', r'\1', collapsed)
    return re.sub(r'[{}]', '', unwrapped).strip()

def similarity(a, b):
    """Return a ratio in [0, 1] between the normalized forms of a and b."""
    left, right = normalize(a), normalize(b)
    return SequenceMatcher(None, left, right).ratio()

def compare_entries(orig, crossref):
    """Compare an original entry against its CrossRef counterpart.

    Returns ``(issues, details, min_sim)``: human-readable issue lines,
    per-field diff data, and the lowest similarity score observed.
    Fields matching with similarity >= 0.9 are treated as equal.
    """
    issues = []
    details = {}
    min_sim = 1.0

    for field in ['title', 'author', 'journal', 'volume', 'number', 'pages', 'year']:
        orig_val = orig.get(field, '')
        cross_val = crossref.get(field, '')

        # Absent on both sides: nothing to compare.
        if not orig_val and not cross_val:
            continue

        if not orig_val:
            # Issue numbers are frequently omitted; don't flag them.
            if field == 'number':
                continue
            issues.append(f"  {field}: MISSING in original")
            details[field] = {'orig': '', 'cross': cross_val, 'issue': 'MISSING', 'sim': 0.0}
            min_sim = 0.0
            continue

        # CrossRef lacks the field: cannot compare, skip silently.
        if not cross_val:
            continue

        score = similarity(orig_val, cross_val)
        if score >= 0.9:
            continue  # close enough -- treat as matching

        issues.extend([
            f"  {field}:",
            f"    Original: {orig_val}",
            f"    CrossRef: {cross_val}",
            f"    Similarity: {score:.2%}",
        ])
        details[field] = {'orig': orig_val, 'cross': cross_val, 'sim': score}
        min_sim = min(min_sim, score)

    return issues, details, min_sim

def main():
    """CLI entry point: compare two bib files and append a markdown report.

    Usage: compare_refs.py <orig_bib> <crossref_bib> <output_report>
    Prints progress to stdout and appends a "CrossRef Metadata Comparison"
    section to the report file.
    """
    # Bug fix: json was used below but never imported at module level.
    import json

    if len(sys.argv) != 4:
        print(json.dumps({"error": "Usage: compare_refs.py <orig_bib> <crossref_bib> <output_report>"}))
        sys.exit(1)

    orig_file = sys.argv[1]
    cross_file = sys.argv[2]
    report_file = sys.argv[3]

    if not os.path.exists(orig_file):
        print(json.dumps({"error": f"Original file not found: {orig_file}"}))
        sys.exit(1)

    if not os.path.exists(cross_file):
        print(json.dumps({"error": f"CrossRef file not found: {cross_file}"}))
        sys.exit(1)

    print("Parsing files...")
    orig_entries = parse_bib_entries(orig_file)
    cross_entries = parse_bib_entries(cross_file)

    print(f"Found {len(orig_entries)} entries in ref.bib")
    print(f"Found {len(cross_entries)} entries in ref_.bib\n")

    print("=" * 80)
    print("COMPARISON RESULTS")
    print("=" * 80)

    report_data = []
    entries_with_doi = 0
    entries_without_doi = 0

    for key in sorted(orig_entries.keys()):
        if key not in cross_entries:
            # No CrossRef counterpart (entry had no DOI); skip comparison.
            entries_without_doi += 1
            continue
        entries_with_doi += 1
        issues, details, min_sim = compare_entries(orig_entries[key], cross_entries[key])
        if issues:
            print(f"\n[{key}]")
            for issue in issues:
                print(issue)
            doi = orig_entries[key].get('doi', cross_entries[key].get('doi', ''))
            url = orig_entries[key].get('url', cross_entries[key].get('url', ''))
            report_data.append({'key': key, 'doi': doi, 'url': url, 'details': details, 'min_sim': min_sim})

    entries_with_issues = len(report_data)
    entries_no_issues = entries_with_doi - entries_with_issues

    print("\n" + "=" * 80)
    print(f"Total entries with discrepancies: {entries_with_issues}")
    print("=" * 80)

    # Worst matches first so reviewers see likely errors at the top.
    report_data.sort(key=lambda x: x['min_sim'])

    # Bug fix: guard against ZeroDivisionError when no entry had
    # CrossRef data (entries_with_doi == 0).
    pct_ok = entries_no_issues / entries_with_doi * 100 if entries_with_doi else 0.0
    pct_bad = entries_with_issues / entries_with_doi * 100 if entries_with_doi else 0.0

    # Append (not overwrite): the report already contains the
    # unused-references section written by an earlier step.
    with open(report_file, 'a', encoding='utf-8') as f:
        f.write("\n\n## CrossRef Metadata Comparison\n\n")
        f.write("### Summary\n\n")
        f.write(f"- **Total entries in ref.bib**: {len(orig_entries)}\n")
        f.write(f"- **Entries with DOI (in ref_.bib)**: {entries_with_doi}\n")
        f.write(f"- **Entries without DOI (skipped)**: {entries_without_doi}\n")
        f.write(f"- **Entries with no discrepancies**: {entries_no_issues} ({pct_ok:.1f}%)\n")
        f.write(f"- **Entries with discrepancies**: {entries_with_issues} ({pct_bad:.1f}%)\n\n")
        f.write("### Detailed Comparison\n\n")
        f.write("Entries are sorted by similarity (lowest first).\n\n")
        f.write("| # | Citation Key | Field | Original | CrossRef | Similarity | URL |\n")
        f.write("|---|--------------|-------|----------|----------|------------|-----|\n")

        idx = 1
        for item in report_data:
            key = item['key']
            line = orig_entries[key].get('line', 0)
            doi = item['doi']
            url = item['url']
            link = f"https://doi.org/{doi}" if doi else url
            key_md = f"[{key}](ref.bib#L{line})"
            url_md = f"[url]({link})" if link else ""

            for field, info in item['details'].items():
                # Escape pipes and newlines so the markdown table stays valid;
                # truncate long values to keep rows readable.
                orig = info['orig'].replace('|', '\\|').replace('\n', ' ')[:60]
                cross = info['cross'].replace('|', '\\|').replace('\n', ' ')[:60]
                sim = info.get('sim', 0.0)
                sim_str = "MISSING" if info.get('issue') == 'MISSING' else f"{sim:.0%}"
                f.write(f"| {idx} | {key_md} | {field} | {orig} | {cross} | {sim_str} | {url_md} |\n")
            idx += 1

    print(f"\nMarkdown report saved to: {report_file}")

if __name__ == "__main__":
    main()
Loading