From cbaf63575e83350eb06ed9aec4afffce5a689941 Mon Sep 17 00:00:00 2001 From: Minsu Lee Date: Sat, 13 Sep 2025 12:02:14 +0000 Subject: [PATCH 1/2] feat(context): add git repository identifier support for consistent collection naming --- packages/core/src/context.ts | 48 +++++++++++++++++++++++++++--------- packages/mcp/src/handlers.ts | 48 ++++++++++++++++++++++++++++++------ packages/mcp/src/index.ts | 11 +++++++-- 3 files changed, 85 insertions(+), 22 deletions(-) diff --git a/packages/core/src/context.ts b/packages/core/src/context.ts index 1af13058..ede26627 100644 --- a/packages/core/src/context.ts +++ b/packages/core/src/context.ts @@ -213,8 +213,8 @@ export class Context { /** * Public wrapper for prepareCollection private method */ - async getPreparedCollection(codebasePath: string): Promise { - return this.prepareCollection(codebasePath); + async getPreparedCollection(codebasePath: string, gitRepoIdentifier?: string | null): Promise { + return this.prepareCollection(codebasePath, false, gitRepoIdentifier); } /** @@ -230,12 +230,31 @@ export class Context { /** * Generate collection name based on codebase path and hybrid mode + * Optionally accepts a git repository identifier for consistent naming across different local paths */ - public getCollectionName(codebasePath: string): string { + public getCollectionName(codebasePath: string, gitRepoIdentifier?: string | null): string { const isHybrid = this.getIsHybrid(); + const prefix = isHybrid === true ? 
'hybrid_code_chunks' : 'code_chunks'; + + // If git repository identifier is provided, use it for collection naming + if (gitRepoIdentifier) { + // Create a clean identifier by replacing special characters + const cleanIdentifier = gitRepoIdentifier + .replace(/[^a-zA-Z0-9]/g, '_') // Replace non-alphanumeric with underscore + .toLowerCase() + .substring(0, 32); // Limit length for collection name + + // Create hash from the git identifier for uniqueness + const hash = crypto.createHash('md5').update(gitRepoIdentifier).digest('hex'); + + console.log(`[Context] Using git-based collection naming for: ${gitRepoIdentifier}`); + return `${prefix}_git_${cleanIdentifier}_${hash.substring(0, 8)}`; + } + + // Fallback to path-based naming (original behavior) const normalizedPath = path.resolve(codebasePath); const hash = crypto.createHash('md5').update(normalizedPath).digest('hex'); - const prefix = isHybrid === true ? 'hybrid_code_chunks' : 'code_chunks'; + console.log(`[Context] Using path-based collection naming for: ${normalizedPath}`); return `${prefix}_${hash.substring(0, 8)}`; } @@ -405,13 +424,15 @@ export class Context { * @param query Search query * @param topK Number of results to return * @param threshold Similarity threshold + * @param filterExpr Optional filter expression + * @param gitRepoIdentifier Optional git repository identifier for consistent collection naming */ - async semanticSearch(codebasePath: string, query: string, topK: number = 5, threshold: number = 0.5, filterExpr?: string): Promise { + async semanticSearch(codebasePath: string, query: string, topK: number = 5, threshold: number = 0.5, filterExpr?: string, gitRepoIdentifier?: string | null): Promise { const isHybrid = this.getIsHybrid(); const searchType = isHybrid === true ? 
'hybrid search' : 'semantic search'; console.log(`[Context] ๐Ÿ” Executing ${searchType}: "${query}" in ${codebasePath}`); - const collectionName = this.getCollectionName(codebasePath); + const collectionName = this.getCollectionName(codebasePath, gitRepoIdentifier); console.log(`[Context] ๐Ÿ” Using collection: ${collectionName}`); // Check if collection exists and has data @@ -518,10 +539,11 @@ export class Context { /** * Check if index exists for codebase * @param codebasePath Codebase path to check + * @param gitRepoIdentifier Optional git repository identifier for consistent collection naming * @returns Whether index exists */ - async hasIndex(codebasePath: string): Promise { - const collectionName = this.getCollectionName(codebasePath); + async hasIndex(codebasePath: string, gitRepoIdentifier?: string | null): Promise { + const collectionName = this.getCollectionName(codebasePath, gitRepoIdentifier); return await this.vectorDatabase.hasCollection(collectionName); } @@ -529,16 +551,18 @@ export class Context { * Clear index * @param codebasePath Codebase path to clear index for * @param progressCallback Optional progress callback function + * @param gitRepoIdentifier Optional git repository identifier for consistent collection naming */ async clearIndex( codebasePath: string, - progressCallback?: (progress: { phase: string; current: number; total: number; percentage: number }) => void + progressCallback?: (progress: { phase: string; current: number; total: number; percentage: number }) => void, + gitRepoIdentifier?: string | null ): Promise { console.log(`[Context] ๐Ÿงน Cleaning index data for ${codebasePath}...`); progressCallback?.({ phase: 'Checking existing index...', current: 0, total: 100, percentage: 0 }); - const collectionName = this.getCollectionName(codebasePath); + const collectionName = this.getCollectionName(codebasePath, gitRepoIdentifier); const collectionExists = await this.vectorDatabase.hasCollection(collectionName); progressCallback?.({ 
phase: 'Removing index data...', current: 50, total: 100, percentage: 50 }); @@ -622,11 +646,11 @@ export class Context { /** * Prepare vector collection */ - private async prepareCollection(codebasePath: string, forceReindex: boolean = false): Promise { + private async prepareCollection(codebasePath: string, forceReindex: boolean = false, gitRepoIdentifier?: string | null): Promise { const isHybrid = this.getIsHybrid(); const collectionType = isHybrid === true ? 'hybrid vector' : 'vector'; console.log(`[Context] ๐Ÿ”ง Preparing ${collectionType} collection for codebase: ${codebasePath}${forceReindex ? ' (FORCE REINDEX)' : ''}`); - const collectionName = this.getCollectionName(codebasePath); + const collectionName = this.getCollectionName(codebasePath, gitRepoIdentifier); // Check if collection already exists const collectionExists = await this.vectorDatabase.hasCollection(collectionName); diff --git a/packages/mcp/src/handlers.ts b/packages/mcp/src/handlers.ts index 1530d0c3..e0a96f94 100644 --- a/packages/mcp/src/handlers.ts +++ b/packages/mcp/src/handlers.ts @@ -3,7 +3,12 @@ import * as path from "path"; import * as crypto from "crypto"; import { Context, COLLECTION_LIMIT_MESSAGE } from "@zilliz/claude-context-core"; import { SnapshotManager } from "./snapshot.js"; -import { ensureAbsolutePath, truncateContent, trackCodebasePath } from "./utils.js"; +import { + ensureAbsolutePath, + truncateContent, + trackCodebasePath, + getRepositoryIdentifier +} from "./utils.js"; export class ToolHandlers { private context: Context; @@ -199,8 +204,14 @@ export class ToolHandlers { }; } + // Get git repository identifier for consistent collection naming + const gitRepoIdentifier = getRepositoryIdentifier(absolutePath); + if (gitRepoIdentifier) { + console.log(`[INDEX-VALIDATION] ๐Ÿ”— Git repository detected: ${gitRepoIdentifier}`); + } + //Check if the snapshot and cloud index are in sync - if (this.snapshotManager.getIndexedCodebases().includes(absolutePath) !== await 
this.context.hasIndex(absolutePath)) { + if (this.snapshotManager.getIndexedCodebases().includes(absolutePath) !== await this.context.hasIndex(absolutePath, gitRepoIdentifier)) { console.warn(`[INDEX-VALIDATION] โŒ Snapshot and cloud index mismatch: ${absolutePath}`); } @@ -221,9 +232,9 @@ export class ToolHandlers { console.log(`[FORCE-REINDEX] ๐Ÿ”„ Removing '${absolutePath}' from indexed list for re-indexing`); this.snapshotManager.removeIndexedCodebase(absolutePath); } - if (await this.context.hasIndex(absolutePath)) { + if (await this.context.hasIndex(absolutePath, gitRepoIdentifier)) { console.log(`[FORCE-REINDEX] ๐Ÿ”„ Clearing index for '${absolutePath}'`); - await this.context.clearIndex(absolutePath); + await this.context.clearIndex(absolutePath, undefined, gitRepoIdentifier); } } @@ -339,6 +350,14 @@ export class ToolHandlers { console.warn(`[BACKGROUND-INDEX] Non-AST splitter '${splitterType}' requested; falling back to AST splitter`); } + // Get git repository identifier if available + const gitRepoIdentifier = getRepositoryIdentifier(absolutePath); + if (gitRepoIdentifier) { + console.log(`[BACKGROUND-INDEX] ๐Ÿ”— Git repository detected: ${gitRepoIdentifier}`); + } else { + console.log(`[BACKGROUND-INDEX] ๐Ÿ“ Using path-based identification (not a git repository or no remote)`); + } + // Load ignore patterns from files first (including .ignore, .gitignore, etc.) 
await this.context.getLoadedIgnorePatterns(absolutePath); @@ -350,8 +369,8 @@ export class ToolHandlers { await synchronizer.initialize(); // Store synchronizer in the context (let context manage collection names) - await this.context.getPreparedCollection(absolutePath); - const collectionName = this.context.getCollectionName(absolutePath); + await this.context.getPreparedCollection(absolutePath, gitRepoIdentifier); + const collectionName = this.context.getCollectionName(absolutePath, gitRepoIdentifier); this.context.setSynchronizer(collectionName, synchronizer); if (contextForThisTask !== this.context) { contextForThisTask.setSynchronizer(collectionName, synchronizer); @@ -447,6 +466,12 @@ export class ToolHandlers { trackCodebasePath(absolutePath); + // Get git repository identifier if available for consistent collection naming + const gitRepoIdentifier = getRepositoryIdentifier(absolutePath); + if (gitRepoIdentifier) { + console.log(`[SEARCH] ๐Ÿ”— Git repository detected: ${gitRepoIdentifier}`); + } + // Check if this codebase is indexed or being indexed const isIndexed = this.snapshotManager.getIndexedCodebases().includes(absolutePath); const isIndexing = this.snapshotManager.getIndexingCodebases().includes(absolutePath); @@ -500,7 +525,8 @@ export class ToolHandlers { query, Math.min(resultLimit, 50), 0.3, - filterExpr + filterExpr, + gitRepoIdentifier ); console.log(`[SEARCH] โœ… Search completed! 
Found ${searchResults.length} results using ${embeddingProvider.getProvider()} embeddings`); @@ -621,10 +647,16 @@ export class ToolHandlers { }; } + // Get git repository identifier for consistent collection naming + const gitRepoIdentifier = getRepositoryIdentifier(absolutePath); + if (gitRepoIdentifier) { + console.log(`[CLEAR] ๐Ÿ”— Git repository detected: ${gitRepoIdentifier}`); + } + console.log(`[CLEAR] Clearing codebase: ${absolutePath}`); try { - await this.context.clearIndex(absolutePath); + await this.context.clearIndex(absolutePath, undefined, gitRepoIdentifier); console.log(`[CLEAR] Successfully cleared index for: ${absolutePath}`); } catch (error: any) { const errorMsg = `Failed to clear ${absolutePath}: ${error.message}`; diff --git a/packages/mcp/src/index.ts b/packages/mcp/src/index.ts index 8c4c3b28..08316699 100644 --- a/packages/mcp/src/index.ts +++ b/packages/mcp/src/index.ts @@ -88,6 +88,12 @@ Index a codebase directory to enable semantic search using a configurable code s โš ๏ธ **IMPORTANT**: - You MUST provide an absolute path to the target codebase. +- If the path is a git repository with a remote URL, it will automatically use the git remote for consistent collection naming across different local paths. + +๐Ÿ”— **Git Repository Support**: +- Automatically detects git repositories and uses remote URL for collection naming +- Same repository cloned to different paths will share the same collection +- Ensures consistency across team members and machines โœจ **Usage Guidance**: - This tool is typically used when search fails due to an unindexed codebase. @@ -100,6 +106,7 @@ Search the indexed codebase using natural language queries within a specified ab โš ๏ธ **IMPORTANT**: - You MUST provide an absolute path. +- If the path is a git repository, it will automatically use the correct collection based on the git remote URL. 
๐ŸŽฏ **When to Use**: This tool is versatile and can be used before completing various tasks to retrieve relevant context: @@ -195,7 +202,7 @@ This tool is versatile and can be used before completing various tasks to retrie }, { name: "clear_index", - description: `Clear the search index. IMPORTANT: You MUST provide an absolute path.`, + description: `Clear the search index. IMPORTANT: You MUST provide an absolute path. Git repositories will be identified by their remote URL for accurate clearing.`, inputSchema: { type: "object", properties: { @@ -209,7 +216,7 @@ This tool is versatile and can be used before completing various tasks to retrie }, { name: "get_indexing_status", - description: `Get the current indexing status of a codebase. Shows progress percentage for actively indexing codebases and completion status for indexed codebases.`, + description: `Get the current indexing status of a codebase. Shows progress percentage for actively indexing codebases and completion status for indexed codebases. 
Git repositories are identified by their remote URL.`, inputSchema: { type: "object", properties: { From cc6686a0caf0c05ca7c3cba73aa7bff9cddfff26 Mon Sep 17 00:00:00 2001 From: Minsu Lee Date: Sat, 13 Sep 2025 12:04:30 +0000 Subject: [PATCH 2/2] feat(context): add git repository identifier support for consistent collection naming --- packages/mcp/src/utils.ts | 101 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/packages/mcp/src/utils.ts b/packages/mcp/src/utils.ts index 64389e2d..3affc0e2 100644 --- a/packages/mcp/src/utils.ts +++ b/packages/mcp/src/utils.ts @@ -1,4 +1,6 @@ import * as path from "path"; +import { execSync } from "child_process"; +import * as fs from "fs"; /** * Truncate content to specified length @@ -27,4 +29,103 @@ export function ensureAbsolutePath(inputPath: string): string { export function trackCodebasePath(codebasePath: string): void { const absolutePath = ensureAbsolutePath(codebasePath); console.log(`[TRACKING] Tracked codebase path: ${absolutePath} (not marked as indexed)`); +} + +/** + * Check whether dirPath directly contains a `.git` entry; parent directories are not searched, so a subdirectory of a repository reports false + */ +export function isGitRepository(dirPath: string): boolean { + try { + const gitDir = path.join(dirPath, '.git'); + return fs.existsSync(gitDir); + } catch { + return false; + } +} + +/** + * Extract the URL of the `origin` remote by running `git remote get-url origin` with repoPath as the working directory + * @param repoPath Path to the git repository + * @returns Trimmed remote URL, or null if repoPath has no `.git` entry, the git command fails, or its output is empty + */ +export function extractGitRemoteUrl(repoPath: string): string | null { + try { + if (!isGitRepository(repoPath)) { + return null; + } + + // Try to get the origin remote URL + const result = execSync('git remote get-url origin', { + cwd: repoPath, + encoding: 'utf8', + stdio: ['ignore', 'pipe', 'ignore'] // Ignore stderr to suppress git errors + }).trim(); + + return result || null; + } catch { + // If no origin remote or git command fails, return null + return null; + } +} + +/** + * Parse and normalize a git URL to a 
standard identifier + * Handles various formats: + * - https://github.com/org/repo.git + * - git@github.com:org/repo.git + * - https://gitlab.com/org/repo (embedded user:token@ credentials and trailing slashes are stripped) + * + * @param gitUrl The git remote URL + * @returns Normalized identifier like "github.com/org/repo", or null if the URL format is not recognized + */ +export function parseGitUrl(gitUrl: string): string | null { + try { + // Remove surrounding whitespace + gitUrl = gitUrl.trim(); + + // Handle SSH scp-like format (git@github.com:org/repo.git), tolerating trailing slashes + const sshMatch = gitUrl.match(/^git@([^:]+):(.+?)(\.git)?\/*$/); + if (sshMatch) { + const host = sshMatch[1]; + const repoPath = sshMatch[2]; + return `${host}/${repoPath}`; + } + + // Handle HTTP(S) format; userinfo (user:token@) is dropped so credentials never reach logs or collection names + const httpsMatch = gitUrl.match(/^https?:\/\/(?:[^@\/]+@)?([^\/]+)\/(.+?)(\.git)?\/*$/); + if (httpsMatch) { + const host = httpsMatch[1]; + const repoPath = httpsMatch[2]; + return `${host}/${repoPath}`; + } + + // If no match, return null + return null; + } catch { + return null; + } +} + +/** + * Get a repository identifier from a path + * Uses the git remote URL only; returns null (no path-based fallback is computed here) when the remote is unavailable or unparseable + * + * @param dirPath Directory path + * @returns Repository identifier or null + */ +export function getRepositoryIdentifier(dirPath: string): string | null { + // Try to get git remote URL + const gitUrl = extractGitRemoteUrl(dirPath); + + if (gitUrl) { + const identifier = parseGitUrl(gitUrl); + if (identifier) { + console.log(`[GIT-UTILS] Repository identified via git remote: ${identifier}`); + return identifier; + } + } + + // If not a git repo or parsing fails, return null + // The caller will handle the fallback to path-based identification + return null; } \ No newline at end of file