Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions src/datacustomcode/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,8 +201,20 @@ def run_function_with_test(entrypoint: str, test_file: str) -> None:


def add_py_folder(entrypoint: str) -> None:
    """Add the entrypoint directory and its py-files subfolder to sys.path.

    This ensures:
    1. py-files/ is available for additional dependencies (only when the
       folder actually exists next to the entrypoint)
    2. The entrypoint directory is available for local module imports

    Args:
        entrypoint: Path to the entrypoint file; a relative path is resolved
            against the current working directory.
    """
    default_py_folder = "py-files"
    entrypoint_dir = Path.cwd().joinpath(entrypoint).parent
    py_folder = entrypoint_dir.joinpath(default_py_folder)

    # Add py-files folder if it exists; a missing folder is not put on
    # sys.path at all.
    if py_folder.exists():
        sys.path.insert(0, str(py_folder))

    # Inserted last so the entrypoint directory ends up first on sys.path,
    # which lets modules sitting next to the entrypoint be imported.
    sys.path.insert(0, str(entrypoint_dir))
12 changes: 11 additions & 1 deletion src/datacustomcode/scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,17 @@ def scan_file_for_imports(file_path: str) -> Set[str]:
tree = ast.parse(code)
visitor = ImportVisitor()
visitor.visit(tree)
return visitor.imports

# Filter out local modules
file_dir = os.path.dirname(file_path)
filtered_imports = set()
for package in visitor.imports:
# Check if a .py file exists in the same directory
local_module_path = os.path.join(file_dir, f"{package}.py")
if not os.path.exists(local_module_path):
filtered_imports.add(package)

return filtered_imports


def write_requirements_file(file_path: str) -> str:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import logging

from utility import extract_citations, split_text_into_chunks

from datacustomcode.function import Runtime
from datacustomcode.function.feature_types.chunking import (
ChunkType,
Expand All @@ -15,80 +17,6 @@
DEFAULT_MAX_CHUNK_SIZE = 50


def split_text_into_chunks(text: str, max_size: int, overlap: int = 20):
    """Split text into overlapping chunks, preferring natural break points.

    Each chunk is cut from a window of ``max_size`` characters. Inside the
    window the break is placed right after the last boundary that lies past
    the window's halfway mark, trying boundary kinds in order of preference:

    1. Paragraph boundaries (\\n\\n)
    2. Line boundaries (\\n)
    3. Sentence boundaries (". ", "! ", "? ") - whichever occurs latest
    4. Word boundaries (space)
    5. Hard cut at ``max_size`` if no boundary qualifies

    Every chunk except the final one is stripped of surrounding whitespace,
    and chunks that are empty after stripping are dropped. Text that already
    fits within ``max_size`` is returned unchanged as a single chunk.

    Args:
        text: Text to split
        max_size: Maximum characters per chunk (must be positive)
        overlap: Number of characters to overlap between chunks (must not be
            negative)

    Returns:
        List of text chunks

    Raises:
        ValueError: If ``max_size`` is not positive or ``overlap`` is negative
    """
    if max_size <= 0:
        raise ValueError("max_size must be a positive integer")
    if overlap < 0:
        raise ValueError("overlap must not be negative")

    if len(text) <= max_size:
        return [text]

    # Separator groups in order of preference. Within a group the separator
    # occurring latest in the window wins, so a late "? " is not passed over
    # in favour of an earlier ". ".
    boundary_groups = (("\n\n",), ("\n",), (". ", "! ", "? "), (" ",))
    # A boundary only counts when it lies past the halfway mark of the
    # window; this keeps chunks from becoming needlessly short.
    min_offset = max_size * 0.5
    text_length = len(text)

    chunks = []
    start = 0

    while start < text_length:
        end = start + max_size

        if end >= text_length:
            # Last chunk: take everything that is left.
            chunks.append(text[start:])
            break

        window = text[start:end]
        break_point = None

        for separators in boundary_groups:
            latest = -1
            for separator in separators:
                position = window.rfind(separator)
                if position > min_offset:
                    # Break just after the separator so it stays with the
                    # chunk that precedes it.
                    latest = max(latest, position + len(separator))
            if latest != -1:
                break_point = start + latest
                break

        # If no good breaking point, just hard cut
        if break_point is None:
            break_point = end

        piece = text[start:break_point].strip()
        if piece:
            # Whitespace-only windows would otherwise yield empty chunks.
            chunks.append(piece)

        # Step back by the overlap, but always advance by at least one
        # character so the loop terminates even for a large overlap.
        start = max(break_point - overlap, start + 1)

    return chunks


def function(
request: SearchIndexChunkingV1Request, runtime: Runtime
) -> SearchIndexChunkingV1Response:
Expand Down Expand Up @@ -121,11 +49,7 @@ def function(

# Create chunk outputs
for chunk_text in text_chunks:
# Create citations from source_dmo_fields if available
citations = {}
if metadata and metadata.source_dmo_fields:
for key, value in metadata.source_dmo_fields.items():
citations[key] = str(value)
citations = extract_citations(metadata)

chunk_output = SearchIndexChunkingV1Output(
chunk_type=ChunkType.TEXT,
Expand Down
104 changes: 104 additions & 0 deletions src/datacustomcode/templates/function/chunking/payload/utility.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
"""Utility functions for text chunking operations."""

import logging
from typing import (
Dict,
List,
Optional,
)

from datacustomcode.function.feature_types.chunking import SearchIndexChunkingV1Metadata

logger = logging.getLogger(__name__)


def split_text_into_chunks(text: str, max_size: int, overlap: int = 20) -> List[str]:
    """Split text into overlapping chunks, preferring natural break points.

    Each chunk is cut from a window of ``max_size`` characters. Inside the
    window the break is placed right after the last boundary that lies past
    the window's halfway mark, trying boundary kinds in order of preference:

    1. Paragraph boundaries (\\n\\n)
    2. Line boundaries (\\n)
    3. Sentence boundaries (". ", "! ", "? ") - whichever occurs latest
    4. Word boundaries (space)
    5. Hard cut at ``max_size`` if no boundary qualifies

    Every chunk except the final one is stripped of surrounding whitespace,
    and chunks that are empty after stripping are dropped. Text that already
    fits within ``max_size`` is returned unchanged as a single chunk.

    Args:
        text: Text to split
        max_size: Maximum characters per chunk (must be positive)
        overlap: Number of characters to overlap between chunks (must not be
            negative)

    Returns:
        List of text chunks

    Raises:
        ValueError: If ``max_size`` is not positive or ``overlap`` is negative
    """
    if max_size <= 0:
        raise ValueError("max_size must be a positive integer")
    if overlap < 0:
        raise ValueError("overlap must not be negative")

    if len(text) <= max_size:
        return [text]

    # Separator groups in order of preference. Within a group the separator
    # occurring latest in the window wins, so a late "? " is not passed over
    # in favour of an earlier ". ".
    boundary_groups = (("\n\n",), ("\n",), (". ", "! ", "? "), (" ",))
    # A boundary only counts when it lies past the halfway mark of the
    # window; this keeps chunks from becoming needlessly short.
    min_offset = max_size * 0.5
    text_length = len(text)

    chunks = []
    start = 0

    while start < text_length:
        end = start + max_size

        if end >= text_length:
            # Last chunk: take everything that is left.
            chunks.append(text[start:])
            break

        window = text[start:end]
        break_point = None

        for separators in boundary_groups:
            latest = -1
            for separator in separators:
                position = window.rfind(separator)
                if position > min_offset:
                    # Break just after the separator so it stays with the
                    # chunk that precedes it.
                    latest = max(latest, position + len(separator))
            if latest != -1:
                break_point = start + latest
                break

        # If no good breaking point, just hard cut
        if break_point is None:
            break_point = end

        piece = text[start:break_point].strip()
        if piece:
            # Whitespace-only windows would otherwise yield empty chunks.
            chunks.append(piece)

        # Step back by the overlap, but always advance by at least one
        # character so the loop terminates even for a large overlap.
        start = max(break_point - overlap, start + 1)

    return chunks


def extract_citations(
    metadata: Optional[SearchIndexChunkingV1Metadata],
) -> Dict[str, str]:
    """Build the citation mapping for a document from its metadata.

    Args:
        metadata: Document metadata whose ``source_dmo_fields`` mapping is
            read; may be ``None``

    Returns:
        A dictionary with one entry per source DMO field, each value
        converted with ``str()``. Empty when there is no metadata or the
        metadata carries no source DMO fields.
    """
    # Nothing to cite without metadata.
    if not metadata:
        return {}

    source_fields = metadata.source_dmo_fields
    if not source_fields:
        return {}

    return {name: str(raw) for name, raw in source_fields.items()}
42 changes: 42 additions & 0 deletions tests/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,48 @@ def test_run_entrypoint_with_dependencies():
sys.path.remove(module_dir)


def test_add_py_folder_enables_local_imports():
    """Test that add_py_folder adds entrypoint directory to sys.path."""
    from datacustomcode.run import add_py_folder

    # Snapshot interpreter state before anything that can fail, so the
    # cleanup in ``finally`` never references an unbound name.
    original_path = sys.path.copy()
    # A ``utility`` module cached by an earlier import would shadow the one
    # created below, so set it aside for the duration of the test.
    original_utility = sys.modules.pop("utility", None)

    # Create a temporary directory structure
    temp_dir = tempfile.mkdtemp()

    try:
        # Create a utility module in the temp directory
        utility_path = os.path.join(temp_dir, "utility.py")
        with open(utility_path, "w") as f:
            f.write("TEST_VALUE = 'local_module_works'\n")

        # Create an entrypoint file
        entrypoint_path = os.path.join(temp_dir, "entrypoint.py")
        with open(entrypoint_path, "w") as f:
            f.write("# Test entrypoint\n")

        # Call add_py_folder with relative path from current directory
        relative_entrypoint = os.path.relpath(entrypoint_path)
        add_py_folder(relative_entrypoint)

        # verify we can now import the utility module
        import utility

        assert hasattr(utility, "TEST_VALUE"), "utility module should have TEST_VALUE"
        assert (
            utility.TEST_VALUE == "local_module_works"
        ), f"Expected 'local_module_works', got {utility.TEST_VALUE}"

    finally:
        # Restore sys.path in place so other references to the list object
        # observe the restored contents as well.
        sys.path[:] = original_path
        # Drop the temp module and put back whatever was cached before.
        sys.modules.pop("utility", None)
        if original_utility is not None:
            sys.modules["utility"] = original_utility
        shutil.rmtree(temp_dir, ignore_errors=True)


class TestDataspaceScenarios:
"""Test dataspace functionality in run_entrypoint."""

Expand Down
62 changes: 62 additions & 0 deletions tests/test_scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -927,3 +927,65 @@ def test_excluded_packages(self):
assert "pyspark" not in imports
finally:
os.unlink(temp_path)

def test_local_module_exclusion(self):
    """Test that local modules (files in the same directory) are excluded."""
    # Two local modules plus a script that imports both of them alongside
    # external packages.
    sources = {
        "utility.py": """
            def helper_function():
                return "helper"
            """,
        "helpers.py": """
            def another_helper():
                return "another"
            """,
        "main.py": """
            from utility import helper_function
            from helpers import another_helper
            import pandas as pd
            import numpy as np
            """,
    }

    # TemporaryDirectory removes the directory and everything in it on exit,
    # even when the scan raises or leaves extra files behind.
    with tempfile.TemporaryDirectory() as temp_dir:
        for file_name, content in sources.items():
            with open(os.path.join(temp_dir, file_name), "w") as f:
                f.write(textwrap.dedent(content))

        # Scan for imports
        imports = scan_file_for_imports(os.path.join(temp_dir, "main.py"))

    # External packages should be included
    assert "pandas" in imports
    assert "numpy" in imports

    # Local modules should be excluded
    assert "utility" not in imports
    assert "helpers" not in imports
Loading