diff --git a/README.md b/README.md index 3fd8c6f7..17858b92 100644 --- a/README.md +++ b/README.md @@ -386,7 +386,7 @@ Code Context is a monorepo containing three main packages: - **Embedding Providers**: [OpenAI](https://openai.com), [VoyageAI](https://voyageai.com), [Ollama](https://ollama.ai), [Gemini](https://gemini.google.com) - **Vector Databases**: [Milvus](https://milvus.io) or [Zilliz Cloud](https://zilliz.com/cloud)(fully managed vector database as a service) - **Code Splitters**: AST-based splitter (with automatic fallback), LangChain character-based splitter -- **Languages**: TypeScript, JavaScript, Python, Java, C++, C#, Go, Rust, PHP, Ruby, Swift, Kotlin, Scala, Markdown +- **Languages**: TypeScript, JavaScript, Python, Java, C++, C#, Go, Rust, Zig, PHP, Ruby, Swift, Kotlin, Scala, Markdown - **Development Tools**: VSCode, Model Context Protocol --- diff --git a/package.json b/package.json index 8f2611e8..13d0c729 100644 --- a/package.json +++ b/package.json @@ -52,5 +52,8 @@ "url": "https://github.com/zilliztech/code-context.git" }, "license": "MIT", - "author": "Cheney Zhang <277584121@qq.com>" + "author": "Cheney Zhang <277584121@qq.com>", + "dependencies": { + "@tree-sitter-grammars/tree-sitter-zig": "^1.1.2" + } } \ No newline at end of file diff --git a/packages/core/README.md b/packages/core/README.md index 3f20c796..f77fbc61 100644 --- a/packages/core/README.md +++ b/packages/core/README.md @@ -94,7 +94,7 @@ results.forEach(result => { ## Features -- **Multi-language Support**: Index TypeScript, JavaScript, Python, Java, C++, and many other programming languages +- **Multi-language Support**: Index TypeScript, JavaScript, Python, Java, C++, Zig, and many other programming languages - **Semantic Search**: Find code using natural language queries powered by AI embeddings - **Flexible Architecture**: Pluggable embedding providers and vector databases - **Smart Chunking**: Intelligent code splitting that preserves context and structure @@ -136,7 +136,7 @@ interface CodeContextConfig { [ // Programming languages '.ts', '.tsx', '.js', '.jsx', '.py', '.java', '.cpp', '.c', '.h', '.hpp', - '.cs', '.go', '.rs', '.php', '.rb', '.swift', '.kt', '.scala', '.m', '.mm', + '.cs', '.go', '.rs', '.zig', '.php', '.rb', '.swift', '.kt', '.scala', '.m', '.mm', // Text and markup files '.md', '.markdown' ] diff --git a/packages/core/package.json b/packages/core/package.json index db3613fa..11a48044 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -14,6 +14,7 @@ }, "dependencies": { "@google/genai": "^1.9.0", + "@tree-sitter-grammars/tree-sitter-zig": "^1.1.2", "@zilliz/milvus2-sdk-node": "^2.5.10", "faiss-node": "^0.5.1", "fs-extra": "^11.0.0", diff --git a/packages/core/src/context.ts b/packages/core/src/context.ts index cc5c9aa4..9cf5d8c4 100644 --- a/packages/core/src/context.ts +++ b/packages/core/src/context.ts @@ -23,7 +23,7 @@ import { FileSynchronizer } from './sync/synchronizer'; const DEFAULT_SUPPORTED_EXTENSIONS = [ // Programming languages '.ts', '.tsx', '.js', '.jsx', '.py', '.java', '.cpp', '.c', '.h', '.hpp', - '.cs', '.go', '.rs', '.php', '.rb', '.swift', '.kt', '.scala', '.m', '.mm', + '.cs', '.go', '.rs', '.php', '.rb', '.swift', '.kt', '.scala', '.m', '.mm', '.zig', // Text and markup files '.md', '.markdown', '.ipynb', // '.txt', '.json', '.yaml', '.yml', '.xml', '.html', '.htm', @@ -626,7 +626,8 @@ export class CodeContext { '.scala': 'scala', '.m': 'objective-c', '.mm': 'objective-c', - '.ipynb': 'jupyter' + '.ipynb': 'jupyter', + '.zig': 'zig' }; return languageMap[ext] || 'text'; } diff --git a/packages/core/src/splitter/ast-splitter.ts b/packages/core/src/splitter/ast-splitter.ts index c3944d24..cb57ea7a 100644 --- a/packages/core/src/splitter/ast-splitter.ts +++ b/packages/core/src/splitter/ast-splitter.ts @@ -9,6 +9,7 @@ const Java = require('tree-sitter-java'); const Cpp = require('tree-sitter-cpp'); const Go = require('tree-sitter-go'); const Rust = require('tree-sitter-rust'); +const Zig = require('@tree-sitter-grammars/tree-sitter-zig'); // Node types that represent logical code units const SPLITTABLE_NODE_TYPES = { @@ -18,7 +19,8 @@ const SPLITTABLE_NODE_TYPES = { java: ['method_declaration', 'class_declaration', 'interface_declaration', 'constructor_declaration'], cpp: ['function_definition', 'class_specifier', 'namespace_definition', 'declaration'], go: ['function_declaration', 'method_declaration', 'type_declaration', 'var_declaration', 'const_declaration'], - rust: ['function_item', 'impl_item', 'struct_item', 'enum_item', 'trait_item', 'mod_item'] + rust: ['function_item', 'impl_item', 'struct_item', 'enum_item', 'trait_item', 'mod_item'], + zig: ['function_declaration', 'variable_declaration', 'test_declaration', 'comptime_declaration', 'using_namespace_declaration'] }; export class AstCodeSplitter implements Splitter { @@ -93,7 +95,8 @@ export class AstCodeSplitter implements Splitter { 'c': { parser: Cpp, nodeTypes: SPLITTABLE_NODE_TYPES.cpp }, 'go': { parser: Go, nodeTypes: SPLITTABLE_NODE_TYPES.go }, 'rust': { parser: Rust, nodeTypes: SPLITTABLE_NODE_TYPES.rust }, - 'rs': { parser: Rust, nodeTypes: SPLITTABLE_NODE_TYPES.rust } + 'rs': { parser: Rust, nodeTypes: SPLITTABLE_NODE_TYPES.rust }, + 'zig': { parser: Zig, nodeTypes: SPLITTABLE_NODE_TYPES.zig } }; return langMap[language.toLowerCase()] || null; @@ -109,9 +112,18 @@ export class AstCodeSplitter implements Splitter { const chunks: CodeChunk[] = []; const codeLines = code.split('\n'); - const traverse = (currentNode: Parser.SyntaxNode) => { + // For Zig and similar languages, only extract top-level declarations + // to avoid duplicating nested functions/types + const shouldOnlyExtractTopLevel = ['zig', 'rust', 'go'].includes(language); + + const traverse = (currentNode: Parser.SyntaxNode, depth: number = 0) => { // Check if this node type should be split into a chunk if (splittableTypes.includes(currentNode.type)) { + // For certain languages, skip nested declarations + if (shouldOnlyExtractTopLevel && depth > 1) { + return; + } + const startLine = currentNode.startPosition.row + 1; const endLine = currentNode.endPosition.row + 1; const nodeText = code.slice(currentNode.startIndex, currentNode.endIndex); @@ -132,7 +144,7 @@ export class AstCodeSplitter implements Splitter { // Continue traversing child nodes for (const child of currentNode.children) { - traverse(child); + traverse(child, depth + 1); } }; @@ -256,7 +268,7 @@ export class AstCodeSplitter implements Splitter { static isLanguageSupported(language: string): boolean { const supportedLanguages = [ 'javascript', 'js', 'typescript', 'ts', 'python', 'py', - 'java', 'cpp', 'c++', 'c', 'go', 'rust', 'rs' + 'java', 'cpp', 'c++', 'c', 'go', 'rust', 'rs', 'zig' ]; return supportedLanguages.includes(language.toLowerCase()); } diff --git a/packages/core/src/splitter/langchain-splitter.ts b/packages/core/src/splitter/langchain-splitter.ts index 9c10c4f2..a07f66df 100644 --- a/packages/core/src/splitter/langchain-splitter.ts +++ b/packages/core/src/splitter/langchain-splitter.ts @@ -63,7 +63,7 @@ export class LangChainCodeSplitter implements Splitter { private mapLanguage(language: string): SupportedLanguage | null { // Map common language names to LangChain supported formats - const languageMap: Record = { + const languageMap: Record = { 'javascript': 'js', 'typescript': 'js', 'python': 'python', @@ -84,9 +84,11 @@ export class LangChainCodeSplitter implements Splitter { 'tex': 'latex', 'solidity': 'sol', 'sol': 'sol', + 'zig': null, // LangChain doesn't have native Zig support, will use fallback }; - return languageMap[language.toLowerCase()] || null; + const mapped = languageMap[language.toLowerCase()]; + return mapped !== undefined ? mapped : null; } private async fallbackSplit(code: string, language: string, filePath?: string): Promise { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index b7527b78..1a3ed07e 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -7,6 +7,10 @@ settings: importers: .: + dependencies: + '@tree-sitter-grammars/tree-sitter-zig': + specifier: ^1.1.2 + version: 1.1.2(tree-sitter@0.21.1) devDependencies: '@types/node': specifier: ^20.0.0 @@ -163,6 +167,9 @@ importers: '@google/genai': specifier: ^1.9.0 version: 1.9.0(@modelcontextprotocol/sdk@1.12.1) + '@tree-sitter-grammars/tree-sitter-zig': + specifier: ^1.1.2 + version: 1.1.2(tree-sitter@0.21.1) '@zilliz/milvus2-sdk-node': specifier: ^2.5.10 version: 2.5.10 @@ -1047,6 +1054,14 @@ packages: '@textlint/types@14.8.4': resolution: {integrity: sha512-9nyY8vVXlr8hHKxa6+37omJhXWCwovMQcgMteuldYd4dOxGm14AK2nXdkgtKEUQnzLGaXy46xwLCfhQy7V7/YA==} + '@tree-sitter-grammars/tree-sitter-zig@1.1.2': + resolution: {integrity: sha512-J0L31HZ2isy3F5zb2g5QWQOv2r/pbruQNL9ADhuQv2pn5BQOzxt80WcEJaYXBeuJ8GHxVT42slpCna8k1c8LOw==} + peerDependencies: + tree-sitter: ^0.22.1 + peerDependenciesMeta: + tree-sitter: + optional: true + '@tybys/wasm-util@0.9.0': resolution: {integrity: sha512-6+7nlbMVX/PVDCwaIQ8nTOPveOcFLSt8GcXdx8hD0bt39uWxYT88uXzqTd4fTvqta7oeUJqudepapKNt2DYJFw==} @@ -5314,6 +5329,13 @@ snapshots: dependencies: '@textlint/ast-node-types': 14.8.4 + '@tree-sitter-grammars/tree-sitter-zig@1.1.2(tree-sitter@0.21.1)': + dependencies: + node-addon-api: 8.4.0 + node-gyp-build: 4.8.4 + optionalDependencies: + tree-sitter: 0.21.1 + '@tybys/wasm-util@0.9.0': dependencies: tslib: 2.8.1