forcedotcom · diksha-sf · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026
diff --git a/src/datacustomcode/run.py b/src/datacustomcode/run.py
@@ -201,8 +201,20 @@ def run_function_with_test(entrypoint: str, test_file: str) -> None:
 
 
 def add_py_folder(entrypoint: str):
-    default_py_folder = "py-files"  # Hardcoded folder name
+    """Put the entrypoint's directory and its py-files/ subfolder on sys.path.
+
+    Both are inserted at the front of sys.path, ending up in this order:
+    1. The entrypoint directory, for local module imports
+    2. py-files/ (only added if it exists), for additional dependencies
+    """
+    default_py_folder = "py-files"
     cwd = Path.cwd().joinpath(entrypoint)
-    py_folder = cwd.parent.joinpath(default_py_folder)
+    entrypoint_dir = cwd.parent
+    py_folder = entrypoint_dir.joinpath(default_py_folder)
+
+    # py-files is optional; only put it on sys.path when it exists
+    if py_folder.exists():
+        sys.path.insert(0, str(py_folder))
 
-    sys.path.append(str(py_folder))
+    # Inserted last so the entrypoint directory is searched before py-files
+    sys.path.insert(0, str(entrypoint_dir))
diff --git a/src/datacustomcode/scan.py b/src/datacustomcode/scan.py
@@ -264,7 +264,17 @@ def scan_file_for_imports(file_path: str) -> Set[str]:
         tree = ast.parse(code)
         visitor = ImportVisitor()
         visitor.visit(tree)
-        return visitor.imports
+
+        # Filter out local modules
+        file_dir = os.path.dirname(file_path)
+        filtered_imports = set()
+        for package in visitor.imports:
+            # Check if a .py file exists in the same directory
+            local_module_path = os.path.join(file_dir, f"{package}.py")
+            if not os.path.exists(local_module_path):
+                filtered_imports.add(package)
+
+        return filtered_imports
 
 
 def write_requirements_file(file_path: str) -> str:

diff --git a/src/datacustomcode/templates/function/chunking/payload/entrypoint.py b/src/datacustomcode/templates/function/chunking/payload/entrypoint.py
@@ -1,5 +1,7 @@
 import logging
 
+from utility import extract_citations, split_text_into_chunks
+
 from datacustomcode.function import Runtime
 from datacustomcode.function.feature_types.chunking import (
     ChunkType,
@@ -15,80 +17,6 @@
 DEFAULT_MAX_CHUNK_SIZE = 50
 
 
-def split_text_into_chunks(text: str, max_size: int, overlap: int = 20):
-    """Split text into chunks with overlap, trying to break at natural boundaries.
-
-    Tries to break at natural boundaries in order of preference:
-    1. Paragraph boundaries (\\n\\n)
-    2. Line boundaries (\\n)
-    3. Sentence boundaries (. ! ?)
-    4. Word boundaries (space)
-    5. Hard cut if no good boundary found
-
-    Args:
-        text: Text to split
-        max_size: Maximum characters per chunk
-        overlap: Number of characters to overlap between chunks
-
-    Returns:
-        List of text chunks
-    """
-    if len(text) <= max_size:
-        return [text]
-
-    chunks = []
-    start = 0
-
-    while start < len(text):
-        # Determine end position for this chunk
-        end = start + max_size
-
-        if end >= len(text):
-            # Last chunk
-            chunks.append(text[start:])
-            break
-
-        # Try to find a good breaking point (in order of preference)
-        chunk_text = text[start:end]
-        break_point = None
-
-        # Try to break at paragraph boundary (\n\n)
-        last_paragraph = chunk_text.rfind("\n\n")
-        if last_paragraph > max_size * 0.5:  # Only if it's past halfway
-            break_point = start + last_paragraph + 2  # +2 to skip the \n\n
-
-        # Try to break at line boundary (\n)
-        if break_point is None:
-            last_newline = chunk_text.rfind("\n")
-            if last_newline > max_size * 0.5:
-                break_point = start + last_newline + 1
-
-        # Try to break at sentence boundary (. ! ?)
-        if break_point is None:
-            for punct in [". ", "! ", "? "]:
-                last_sentence = chunk_text.rfind(punct)
-                if last_sentence > max_size * 0.5:
-                    break_point = start + last_sentence + len(punct)
-                    break
-
-        # Try to break at word boundary (space)
-        if break_point is None:
-            last_space = chunk_text.rfind(" ")
-            if last_space > max_size * 0.5:
-                break_point = start + last_space + 1
-
-        # If no good breaking point, just hard cut
-        if break_point is None:
-            break_point = end
-
-        chunks.append(text[start:break_point].strip())
-
-        # Move start position with overlap
-        start = max(break_point - overlap, start + 1)
-
-    return chunks
-
-
 def function(
     request: SearchIndexChunkingV1Request, runtime: Runtime
 ) -> SearchIndexChunkingV1Response:
@@ -121,11 +49,7 @@ def function(
 
         # Create chunk outputs
         for chunk_text in text_chunks:
-            # Create citations from source_dmo_fields if available
-            citations = {}
-            if metadata and metadata.source_dmo_fields:
-                for key, value in metadata.source_dmo_fields.items():
-                    citations[key] = str(value)
+            citations = extract_citations(metadata)
 
             chunk_output = SearchIndexChunkingV1Output(
                 chunk_type=ChunkType.TEXT,

diff --git a/src/datacustomcode/templates/function/chunking/payload/utility.py b/src/datacustomcode/templates/function/chunking/payload/utility.py
@@ -0,0 +1,104 @@
+"""Utility functions for text chunking operations."""
+
+import logging
+from typing import (
+    Dict,
+    List,
+    Optional,
+)
+
+from datacustomcode.function.feature_types.chunking import SearchIndexChunkingV1Metadata
+
+logger = logging.getLogger(__name__)
+
+
+def split_text_into_chunks(text: str, max_size: int, overlap: int = 20) -> List[str]:
+    """Split text into overlapping chunks, preferring natural break points.
+
+    A break point is only accepted past the midpoint of the current window.
+    Candidates are tried in order of preference:
+    1. Paragraph boundaries (\\n\\n)
+    2. Line boundaries (\\n)
+    3. Sentence boundaries (. ! ? followed by a space; the latest one wins)
+    4. Word boundaries (space)
+    5. Hard cut if no good boundary found
+
+    Args:
+        text: Text to split
+        max_size: Maximum characters per chunk
+        overlap: Number of characters to overlap between chunks
+
+    Returns:
+        List of text chunks (every chunk except the last is stripped)
+    """
+    if len(text) <= max_size:
+        return [text]
+
+    chunks = []
+    start = 0
+
+    while start < len(text):
+        # Determine end position for this chunk
+        end = start + max_size
+
+        if end >= len(text):
+            # Last chunk
+            chunks.append(text[start:])
+            break
+
+        # Try to find a good breaking point (in order of preference)
+        chunk_text = text[start:end]
+        break_point = None
+
+        # Try to break at paragraph boundary (\n\n)
+        last_paragraph = chunk_text.rfind("\n\n")
+        if last_paragraph > max_size * 0.5:  # Only if it's past halfway
+            break_point = start + last_paragraph + 2  # +2 to skip the \n\n
+
+        # Try to break at line boundary (\n)
+        if break_point is None:
+            last_newline = chunk_text.rfind("\n")
+            if last_newline > max_size * 0.5:
+                break_point = start + last_newline + 1
+
+        # Try to break at the latest sentence boundary (. ! ?), whichever
+        # punctuation mark ends it; rfind() yields -1 for marks not present
+        if break_point is None:
+            last_sentence = max(chunk_text.rfind(p) for p in (". ", "! ", "? "))
+            if last_sentence > max_size * 0.5:
+                break_point = start + last_sentence + 2  # +2 skips mark + space
+
+        # Try to break at word boundary (space)
+        if break_point is None:
+            last_space = chunk_text.rfind(" ")
+            if last_space > max_size * 0.5:
+                break_point = start + last_space + 1
+
+        # If no good breaking point, just hard cut
+        if break_point is None:
+            break_point = end
+
+        chunks.append(text[start:break_point].strip())
+
+        # Move start position with overlap; max() guarantees forward progress
+        start = max(break_point - overlap, start + 1)
+
+    return chunks
+
+
+def extract_citations(
+    metadata: Optional[SearchIndexChunkingV1Metadata],
+) -> Dict[str, str]:
+    """Build the citation mapping for a document from its metadata.
+
+    Args:
+        metadata: Document metadata; may be None or lack source_dmo_fields
+
+    Returns:
+        Each source_dmo_fields entry with its value converted via str(), or
+        an empty dict when there is nothing to cite
+    """
+    if not metadata or not metadata.source_dmo_fields:
+        return {}
+    fields = metadata.source_dmo_fields
+    return {key: str(value) for key, value in fields.items()}
diff --git a/tests/test_run.py b/tests/test_run.py
@@ -237,6 +237,48 @@ def test_run_entrypoint_with_dependencies():
             sys.path.remove(module_dir)
 
 
+def test_add_py_folder_enables_local_imports():
+    """Test that add_py_folder adds entrypoint directory to sys.path."""
+    from datacustomcode.run import add_py_folder
+
+    # Snapshot global import state before anything can fail, so the finally
+    # block can always restore it without masking the original error
+    original_path = sys.path.copy()
+    original_utility = sys.modules.pop("utility", None)
+    temp_dir = tempfile.mkdtemp()
+
+    try:
+        # Create a utility module in the temp directory
+        utility_path = os.path.join(temp_dir, "utility.py")
+        with open(utility_path, "w") as f:
+            f.write("TEST_VALUE = 'local_module_works'\n")
+
+        # Create an entrypoint file
+        entrypoint_path = os.path.join(temp_dir, "entrypoint.py")
+        with open(entrypoint_path, "w") as f:
+            f.write("# Test entrypoint\n")
+
+        # Call add_py_folder with relative path from current directory
+        relative_entrypoint = os.path.relpath(entrypoint_path)
+        add_py_folder(relative_entrypoint)
+
+        # verify we can now import the utility module
+        import utility
+
+        assert hasattr(utility, "TEST_VALUE"), "utility module should have TEST_VALUE"
+        assert (
+            utility.TEST_VALUE == "local_module_works"
+        ), f"Expected 'local_module_works', got {utility.TEST_VALUE}"
+
+    finally:
+        # Restore sys.path / sys.modules and remove the temporary files
+        sys.path[:] = original_path
+        sys.modules.pop("utility", None)
+        if original_utility is not None:
+            sys.modules["utility"] = original_utility
+        shutil.rmtree(temp_dir, ignore_errors=True)
+
+
 class TestDataspaceScenarios:
     """Test dataspace functionality in run_entrypoint."""
 

diff --git a/tests/test_scan.py b/tests/test_scan.py
@@ -927,3 +927,65 @@ def test_excluded_packages(self):
             assert "pyspark" not in imports
         finally:
             os.unlink(temp_path)
+
+    def test_local_module_exclusion(self):
+        """Imports backed by a sibling .py file are excluded from the scan result."""
+        # Scratch directory that holds the scanned script and its local modules
+        temp_dir = tempfile.mkdtemp()
+
+        try:
+            # Local module #1: utility.py next to the script
+            utility_path = os.path.join(temp_dir, "utility.py")
+            with open(utility_path, "w") as f:
+                f.write(
+                    textwrap.dedent(
+                        """
+                    def helper_function():
+                        return "helper"
+                    """
+                    )
+                )
+
+            # Local module #2: helpers.py next to the script
+            helpers_path = os.path.join(temp_dir, "helpers.py")
+            with open(helpers_path, "w") as f:
+                f.write(
+                    textwrap.dedent(
+                        """
+                    def another_helper():
+                        return "another"
+                    """
+                    )
+                )
+
+            # The scanned script imports both local modules and external packages
+            main_content = textwrap.dedent(
+                """
+                from utility import helper_function
+                from helpers import another_helper
+                import pandas as pd
+                import numpy as np
+                """
+            )
+            main_path = os.path.join(temp_dir, "main.py")
+            with open(main_path, "w") as f:
+                f.write(main_content)
+
+            # Scan only main.py; the two local modules are never scanned themselves
+            imports = scan_file_for_imports(main_path)
+
+            # External packages (no matching file in temp_dir) must be reported
+            assert "pandas" in imports
+            assert "numpy" in imports
+
+            # Names matching a sibling .py file must be filtered out
+            assert "utility" not in imports
+            assert "helpers" not in imports
+
+        finally:
+            # Remove the files first: os.rmdir only succeeds on an empty directory
+            for file in ["utility.py", "helpers.py", "main.py"]:
+                file_path = os.path.join(temp_dir, file)
+                if os.path.exists(file_path):
+                    os.unlink(file_path)
+            os.rmdir(temp_dir)