diff --git a/.github/workflows/beta-release.yml b/.github/workflows/beta-release.yml index e9972dfa..36adbb60 100644 --- a/.github/workflows/beta-release.yml +++ b/.github/workflows/beta-release.yml @@ -72,8 +72,10 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install bump2version build twine + pip install bump2version build twine psutil pip install -e ".[all,dev]" + # Install memory monitoring tools + pip install memory_profiler - name: Configure git run: | @@ -108,10 +110,28 @@ jobs: python scripts/generate_changelog.py --beta --output BETA_CHANGELOG.md - name: Run tests + env: + # Control memory usage to prevent segmentation faults + PYTHONMALLOC: debug + # Limit the number of threads used by numpy/OpenMP + OMP_NUM_THREADS: 1 + MKL_NUM_THREADS: 1 + OPENBLAS_NUM_THREADS: 1 + # Limit spaCy's memory usage + SPACY_MAX_THREADS: 1 run: | - python -m pytest tests/ -v --tb=short - python -m pytest -m integration -v - python -m pytest tests/benchmark_text_service.py -v + # Print system memory info + free -h || echo "free command not available" + + # Split tests into smaller batches to avoid memory issues + python -m pytest tests/ -v --tb=short -k "not benchmark and not integration" --no-header + + # Run integration tests separately + python -m pytest -m integration -v --no-header + + # Run benchmark tests with reduced sample size + python -c "print('Running memory-intensive benchmark tests with safeguards')" + python -m pytest tests/benchmark_text_service.py -v --no-header - name: Build package run: | diff --git a/examples/quick_start.ipynb b/examples/quick_start.ipynb new file mode 100644 index 00000000..ea7aa03c --- /dev/null +++ b/examples/quick_start.ipynb @@ -0,0 +1,627 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "[Homepage](https://www.datafog.ai) | \n", + "[Discord](https://discord.gg/bzDth394R4) | \n", + "[Github](https://github.com/datafog/datafog-python) | \n", + 
"[Contact](mailto:sid@datafog.ai) |\n", + "[Documentation](https://www.datafog.ai/datafog-docs/)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "# DataFog Quick Start Guide\n\n> **📦 Version Requirement**: This guide is for DataFog v4.2.0 and above\n> \n> ✅ **New in v4.2.0**: GLiNER integration, smart cascading, and enhanced performance\n\nWelcome to DataFog! This notebook demonstrates how to get started with DataFog's fast PII detection and anonymization capabilities.\n\n## What makes DataFog special?\n\n- **🚀 Ultra-Fast**: 190x faster than spaCy for structured PII, 32x faster with GLiNER\n- **🪶 Lightweight**: <2MB core package with optional ML extras\n- **🧠 Smart Engines**: Choose from regex, GLiNER, spaCy, or smart cascading\n- **📦 Production Ready**: Comprehensive testing and performance validation", + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Installation\n", + "\n", + "Let's start by installing DataFog with the advanced features:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Install DataFog with advanced ML features\n", + "!pip install datafog[nlp-advanced] --quiet\n", + "\n", + "print(\"āœ… DataFog installed successfully!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Simple API - Get Started in Seconds\n", + "\n", + "The fastest way to detect PII in your text:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datafog import DataFog\n", + "\n", + "# Create a DataFog instance\n", + "detector = DataFog()\n", + "\n", + "# Sample text with various PII types\n", + "sample_text = \"\"\"\n", + "Hi there! I'm Dr. Sarah Johnson, and you can reach me at sarah.johnson@hospital.com \n", + "or call my office at (555) 123-4567. 
My SSN is 123-45-6789 for verification.\n", + "I work at General Hospital located at 123 Main St, New York, NY 10001.\n", + "My credit card ending in 4111-1111-1111-1111 expires on 12/25.\n", + "\"\"\"\n", + "\n", + "# Detect PII - this uses the fast regex engine by default\n", + "results = detector.scan_text(sample_text)\n", + "\n", + "print(\"šŸ” PII Detection Results:\")\n", + "print(f\"Found {len(results)} pieces of PII:\")\n", + "for entity_type, entities in results.items():\n", + " if entities: # Only show types that were found\n", + " print(f\" {entity_type}: {entities}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Engine Comparison - Choose Your Power Level\n", + "\n", + "DataFog offers multiple engines for different needs. Let's compare them:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datafog.services import TextService\n", + "import time\n", + "\n", + "# Test text with both structured and unstructured PII\n", + "test_text = \"Dr. John Smith works at General Hospital. 
Contact him at john@hospital.com or (555) 123-4567.\"\n", + "\n", + "# Engine configurations\n", + "engines = {\n", + " \"regex\": \"šŸš€ Fastest - Pattern-based detection\",\n", + " \"gliner\": \"⚔ Fast - Modern ML with high accuracy\", \n", + " \"smart\": \"🧠 Balanced - Combines regex + GLiNER for best results\"\n", + "}\n", + "\n", + "print(\"⚔ Engine Performance Comparison\\n\")\n", + "\n", + "for engine_name, description in engines.items():\n", + " try:\n", + " print(f\"{description}\")\n", + " \n", + " # Create service with specific engine\n", + " service = TextService(engine=engine_name)\n", + " \n", + " # Time the detection\n", + " start_time = time.time()\n", + " result = service.annotate_text_sync(test_text)\n", + " end_time = time.time()\n", + " \n", + " # Show results\n", + " processing_time = (end_time - start_time) * 1000 # Convert to milliseconds\n", + " print(f\" ā±ļø Processing time: {processing_time:.2f}ms\")\n", + " print(f\" šŸŽÆ Entities found: {list(result.keys()) if result else 'None'}\")\n", + " print()\n", + " \n", + " except ImportError as e:\n", + " print(f\" āŒ {engine_name} engine not available (missing dependencies)\")\n", + " print(f\" Install with: pip install datafog[nlp-advanced]\")\n", + " print()\n", + " except Exception as e:\n", + " print(f\" āš ļø Error with {engine_name}: {str(e)}\")\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Advanced Detection with GLiNER\n", + "\n", + "GLiNER is DataFog's modern ML engine that provides excellent accuracy for named entity recognition:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Complex text with various entity types\n", + "complex_text = \"\"\"\n", + "Medical Report - Patient: Emily Rodriguez, DOB: 03/15/1985\n", + "Dr. 
Michael Chen from Stanford Medical Center treated the patient.\n", + "Insurance ID: INS-789-456-123, Policy expires December 2024.\n", + "Emergency contact: Maria Rodriguez at (408) 555-9876.\n", + "Address: 1234 Oak Street, San Francisco, CA 94102\n", + "Lab results show glucose level of 120 mg/dL on 2024-01-15.\n", + "\"\"\"\n", + "\n", + "try:\n", + " # Use GLiNER for advanced entity detection\n", + " gliner_service = TextService(engine=\"gliner\")\n", + " \n", + " print(\"🧠 GLiNER Advanced Detection Results:\")\n", + " print(\"=\" * 50)\n", + " \n", + " results = gliner_service.annotate_text_sync(complex_text)\n", + " \n", + " for entity_type, entities in results.items():\n", + " if entities: # Only show found entities\n", + " print(f\"\\n{entity_type}:\")\n", + " for entity in entities:\n", + " print(f\" • {entity}\")\n", + " \n", + " print(f\"\\nāœ… Total entity types detected: {len([k for k, v in results.items() if v])}\")\n", + " \n", + "except ImportError:\n", + " print(\"āŒ GLiNER not available. Install with: pip install datafog[nlp-advanced]\")\n", + "except Exception as e:\n", + " print(f\"āš ļø GLiNER error: {e}\")\n", + " print(\"Falling back to regex engine...\")\n", + " \n", + " # Fallback to regex\n", + " regex_service = TextService(engine=\"regex\")\n", + " results = regex_service.annotate_text_sync(complex_text)\n", + " print(\"\\nšŸš€ Regex Detection Results:\")\n", + " for entity_type, entities in results.items():\n", + " if entities:\n", + " print(f\" {entity_type}: {entities}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
Smart Cascading - Best of All Worlds\n", + "\n", + "The \"smart\" engine combines regex speed with GLiNER accuracy by using a cascading approach:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Text mixing structured PII (perfect for regex) and entities (better with GLiNER)\n", + "mixed_text = \"\"\"\n", + "From: john.doe@techcorp.com\n", + "To: legal@company.com\n", + "Subject: Employee Data Update\n", + "\n", + "Dear Legal Team,\n", + "\n", + "Please update the employee record for Sarah Williams (ID: EMP-12345).\n", + "Her new phone number is (555) 987-6543 and SSN is 987-65-4321.\n", + "She works at our Seattle office and reports to Manager David Chen.\n", + "Her emergency contact is her spouse, Michael Williams, at (555) 111-2222.\n", + "\n", + "Best regards,\n", + "HR Department\n", + "\"\"\"\n", + "\n", + "try:\n", + " # Smart engine: Uses regex first (fast), then GLiNER for missed entities\n", + " smart_service = TextService(engine=\"smart\")\n", + " \n", + " print(\"🧠 Smart Cascading Detection:\")\n", + " print(\"=\" * 40)\n", + " print(\"Strategy: Regex (speed) → GLiNER (accuracy)\\n\")\n", + " \n", + " start_time = time.time()\n", + " results = smart_service.annotate_text_sync(mixed_text)\n", + " end_time = time.time()\n", + " \n", + " # Organize results by category\n", + " structured_pii = ['EMAIL', 'PHONE', 'SSN', 'CREDIT_CARD']\n", + " entity_pii = ['PERSON', 'ORG', 'LOC', 'DATE_TIME']\n", + " \n", + " print(\"šŸ“§ Structured PII (Regex-optimized):\")\n", + " for entity_type in structured_pii:\n", + " if entity_type in results and results[entity_type]:\n", + " print(f\" {entity_type}: {results[entity_type]}\")\n", + " \n", + " print(\"\\nšŸ‘¤ Named Entities (GLiNER-optimized):\")\n", + " for entity_type in entity_pii:\n", + " if entity_type in results and results[entity_type]:\n", + " print(f\" {entity_type}: {results[entity_type]}\")\n", + " \n", + " processing_time = (end_time - 
start_time) * 1000\n", + " print(f\"\\nā±ļø Total processing time: {processing_time:.2f}ms\")\n", + " print(f\"āœ… Combined detection power with optimized speed!\")\n", + " \n", + "except Exception as e:\n", + " print(f\"āš ļø Smart engine error: {e}\")\n", + " print(\"This usually means GLiNER dependencies are missing.\")\n", + " print(\"Install with: pip install datafog[nlp-advanced]\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Anonymization - Protect Your Data\n", + "\n", + "DataFog doesn't just detect PII - it can also anonymize it in various ways:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sample sensitive data\n", + "sensitive_data = \"\"\"\n", + "Patient: John Smith\n", + "Email: john.smith@email.com\n", + "Phone: (555) 123-4567\n", + "SSN: 123-45-6789\n", + "Credit Card: 4111-1111-1111-1111\n", + "Address: 123 Main St, Anytown, NY 12345\n", + "\"\"\"\n", + "\n", + "print(\"šŸ”’ DataFog Anonymization Methods\\n\")\n", + "print(\"Original text:\")\n", + "print(sensitive_data)\n", + "print(\"=\" * 60)\n", + "\n", + "# Method 1: Redaction (replace with [REDACTED])\n", + "redactor = DataFog(operations=[\"scan\", \"redact\"])\n", + "redacted_text = redactor.process_text(sensitive_data)\n", + "print(\"\\n🚫 REDACTED:\")\n", + "print(redacted_text)\n", + "\n", + "# Method 2: Replacement (replace with fake but realistic data)\n", + "replacer = DataFog(operations=[\"scan\", \"replace\"])\n", + "replaced_text = replacer.process_text(sensitive_data)\n", + "print(\"\\nšŸ”„ REPLACED:\")\n", + "print(replaced_text)\n", + "\n", + "# Method 3: Hashing (one-way transformation)\n", + "from datafog.models.anonymizer import HashType\n", + "hasher = DataFog(\n", + " operations=[\"scan\", \"hash\"],\n", + " hash_type=HashType.SHA256\n", + ")\n", + "hashed_text = hasher.process_text(sensitive_data)\n", + "print(\"\\n#ļøāƒ£ HASHED (SHA256):\")\n", + 
"print(hashed_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Selective Processing - Target Specific PII Types\n", + "\n", + "Sometimes you only want to process certain types of PII:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sample text with mixed PII\n", + "business_data = \"\"\"\n", + "Company Report:\n", + "CEO: Amanda Johnson (amanda@company.com)\n", + "CFO: Robert Davis (robert.davis@company.com) \n", + "Phone: (555) 100-2000\n", + "Headquarters: 456 Business Ave, Corporate City, CA 90210\n", + "Tax ID: 12-3456789\n", + "Employee SSN for payroll: 987-65-4321\n", + "\"\"\"\n", + "\n", + "print(\"šŸŽÆ Selective PII Processing\\n\")\n", + "print(\"Original text:\")\n", + "print(business_data)\n", + "print(\"=\" * 50)\n", + "\n", + "# Only process emails and SSNs, leave names and addresses\n", + "selective_redactor = DataFog(\n", + " operations=[\"scan\", \"redact\"],\n", + " entities=[\"EMAIL\", \"SSN\"] # Only target these types\n", + ")\n", + "\n", + "selective_result = selective_redactor.process_text(business_data)\n", + "print(\"\\nšŸŽÆ Selective Redaction (EMAIL + SSN only):\")\n", + "print(selective_result)\n", + "print(\"\\nšŸ’” Notice: Names and addresses are preserved!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. 
Batch Processing - Handle Multiple Documents\n", + "\n", + "Process multiple documents efficiently:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sample document collection\n", + "documents = [\n", + " \"Patient file 1: John Doe, DOB: 01/15/1980, Phone: (555) 111-1111\",\n", + " \"Customer record: jane@email.com, Account: 4532-1234-5678-9012\", \n", + " \"Employee data: Robert Smith, SSN: 123-45-6789, Manager: Sarah Lee\",\n", + " \"Contact info: michael@company.com, Office: (555) 999-8888\",\n", + " \"Invoice #1234: Bill to John at 123 Oak St, Los Angeles, CA 90001\"\n", + "]\n", + "\n", + "print(\"šŸ“š Batch Processing Demo\\n\")\n", + "print(f\"Processing {len(documents)} documents...\\n\")\n", + "\n", + "# Process all documents at once\n", + "batch_detector = DataFog()\n", + "start_time = time.time()\n", + "batch_results = batch_detector.batch_process(documents)\n", + "end_time = time.time()\n", + "\n", + "# Summary results\n", + "total_entities = 0\n", + "entity_counts = {}\n", + "\n", + "for i, result in enumerate(batch_results):\n", + " print(f\"šŸ“„ Document {i+1}:\")\n", + " doc_entities = 0\n", + " for entity_type, entities in result.items():\n", + " if entities:\n", + " count = len(entities)\n", + " doc_entities += count\n", + " entity_counts[entity_type] = entity_counts.get(entity_type, 0) + count\n", + " print(f\" {entity_type}: {entities}\")\n", + " \n", + " if doc_entities == 0:\n", + " print(\" No PII detected\")\n", + " total_entities += doc_entities\n", + " print()\n", + "\n", + "# Performance summary\n", + "processing_time = (end_time - start_time) * 1000\n", + "avg_time_per_doc = processing_time / len(documents)\n", + "\n", + "print(\"šŸ“Š Batch Processing Summary:\")\n", + "print(f\" šŸ“š Documents processed: {len(documents)}\")\n", + "print(f\" šŸŽÆ Total entities found: {total_entities}\")\n", + "print(f\" ā±ļø Total processing time: {processing_time:.2f}ms\")\n", + 
"print(f\" šŸ“ˆ Average per document: {avg_time_per_doc:.2f}ms\")\n", + "print(f\" šŸƒ Throughput: {len(documents) / (processing_time/1000):.1f} docs/sec\")\n", + "\n", + "if entity_counts:\n", + " print(f\"\\nšŸ·ļø Entity breakdown:\")\n", + " for entity_type, count in entity_counts.items():\n", + " print(f\" {entity_type}: {count}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. Performance Showcase - See the Speed\n", + "\n", + "Let's demonstrate DataFog's performance advantage with a realistic document:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Realistic business document (similar to what you'd process in production)\n", + "large_document = \"\"\"\n", + "CONFIDENTIAL EMPLOYEE REPORT - Q1 2024\n", + "\n", + "=== EXECUTIVE SUMMARY ===\n", + "Report generated by: Sarah Johnson (sarah.johnson@company.com)\n", + "Date: March 15, 2024\n", + "Department: Human Resources\n", + "Contact: (555) 100-HR00 ext. 1234\n", + "\n", + "=== EMPLOYEE RECORDS ===\n", + "\n", + "1. John Smith (ID: EMP-001)\n", + " Email: john.smith@company.com\n", + " Phone: (555) 123-4567\n", + " SSN: 123-45-6789\n", + " Address: 123 Oak Street, San Francisco, CA 94102\n", + " Manager: David Chen (david.chen@company.com)\n", + " Salary: $85,000 annually\n", + " Start Date: January 15, 2020\n", + "\n", + "2. Maria Rodriguez (ID: EMP-002)\n", + " Email: maria.rodriguez@company.com\n", + " Phone: (555) 987-6543\n", + " SSN: 987-65-4321\n", + " Address: 456 Pine Ave, Los Angeles, CA 90210\n", + " Manager: Lisa Wang (lisa.wang@company.com)\n", + " Emergency Contact: Carlos Rodriguez (555) 111-2233\n", + "\n", + "3. 
Michael Johnson (ID: EMP-003)\n", + " Email: michael.j@company.com\n", + " Personal Email: mike.personal@gmail.com\n", + " Phone: (555) 456-7890\n", + " SSN: 456-78-9012\n", + " Credit Card on file: 4532-1234-5678-9012 (expires 12/26)\n", + " \n", + "=== PAYROLL INFORMATION ===\n", + "Bank routing: 123456789\n", + "Direct deposit accounts verified on 2024-03-01\n", + "Tax ID: 12-3456789\n", + "\n", + "=== CONTACT INFORMATION ===\n", + "HR Helpline: (555) 888-4HR7\n", + "Benefits questions: benefits@company.com\n", + "IT Support: support@company.com\n", + "Office address: 789 Corporate Blvd, Suite 100, Business City, NY 10001\n", + "\n", + "This document contains sensitive employee information and should be handled according to \n", + "company privacy policies and applicable laws including GDPR, CCPA, and HIPAA where applicable.\n", + "\n", + "Report ID: RPT-2024-Q1-001\n", + "Classification: CONFIDENTIAL\n", + "Retention: 7 years from creation date\n", + "\"\"\"\n", + "\n", + "print(\"šŸš€ Performance Benchmark\\n\")\n", + "print(f\"šŸ“„ Document size: {len(large_document):,} characters\")\n", + "print(f\"šŸ“ Lines of text: {len(large_document.splitlines())}\")\n", + "print(\"=\" * 60)\n", + "\n", + "# Test with different engines\n", + "engines_to_test = [\n", + " (\"regex\", \"šŸš€ Regex Engine (Fastest)\"),\n", + " (\"smart\", \"🧠 Smart Engine (Balanced)\"),\n", + "]\n", + "\n", + "results_comparison = {}\n", + "\n", + "for engine_name, description in engines_to_test:\n", + " try:\n", + " print(f\"\\n{description}\")\n", + " print(\"-\" * 30)\n", + " \n", + " service = TextService(engine=engine_name)\n", + " \n", + " # Run multiple times for accurate timing\n", + " times = []\n", + " for _ in range(3):\n", + " start = time.time()\n", + " result = service.annotate_text_sync(large_document)\n", + " end = time.time()\n", + " times.append((end - start) * 1000)\n", + " \n", + " avg_time = sum(times) / len(times)\n", + " \n", + " # Count entities found\n", + " 
total_entities = sum(len(entities) for entities in result.values() if entities)\n", + " entity_types = len([k for k, v in result.items() if v])\n", + " \n", + " results_comparison[engine_name] = {\n", + " 'time': avg_time,\n", + " 'entities': total_entities,\n", + " 'types': entity_types\n", + " }\n", + " \n", + " print(f\"ā±ļø Average time: {avg_time:.2f}ms\")\n", + " print(f\"šŸŽÆ Entities found: {total_entities}\")\n", + " print(f\"šŸ·ļø Entity types: {entity_types}\")\n", + " print(f\"šŸ“Š Throughput: {len(large_document) / (avg_time/1000):,.0f} chars/sec\")\n", + " \n", + " except Exception as e:\n", + " print(f\"āŒ {engine_name} not available: {e}\")\n", + "\n", + "# Performance comparison\n", + "if len(results_comparison) > 1:\n", + " print(\"\\nšŸ† Performance Comparison:\")\n", + " print(\"=\" * 40)\n", + " \n", + " fastest_time = min(r['time'] for r in results_comparison.values())\n", + " \n", + " for engine, stats in results_comparison.items():\n", + " speedup = fastest_time / stats['time'] if stats['time'] > 0 else 1\n", + " if speedup >= 1:\n", + " print(f\"{engine}: {stats['time']:.2f}ms ({speedup:.1f}x faster) - {stats['entities']} entities\")\n", + " else:\n", + " slowdown = stats['time'] / fastest_time\n", + " print(f\"{engine}: {stats['time']:.2f}ms ({slowdown:.1f}x slower) - {stats['entities']} entities\")\n", + "\n", + "print(\"\\nāœ… DataFog delivers production-ready performance for real-world documents!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## šŸŽ‰ Congratulations!\n", + "\n", + "You've completed the DataFog quick start guide! Here's what you've learned:\n", + "\n", + "### āœ… Key Takeaways\n", + "\n", + "1. **šŸš€ Speed**: DataFog is 190x faster than traditional NLP for structured PII\n", + "2. **🧠 Intelligence**: GLiNER and smart cascading provide excellent accuracy\n", + "3. **šŸ”’ Flexibility**: Multiple anonymization options (redact, replace, hash)\n", + "4. 
**🎯 Precision**: Target specific PII types for selective processing\n", + "5. **📚 Scale**: Efficient batch processing for production workloads\n", + "\n", + "### 🛠️ Engine Selection Guide\n", + "\n", + "| Engine | Best For | Speed | Accuracy |\n", + "|--------|----------|-------|----------|\n", + "| `regex` | Structured PII (emails, phones, SSN) | 🚀🚀🚀 | ⭐⭐⭐ |\n", + "| `gliner` | Named entities (people, orgs, locations) | 🚀🚀 | ⭐⭐⭐⭐ |\n", + "| `smart` | **Production use (recommended)** | 🚀🚀 | ⭐⭐⭐⭐⭐ |\n", + "\n", + "### 🚀 Next Steps\n", + "\n", + "- **Production**: Use `engine=\"smart\"` for best balance of speed and accuracy\n", + "- **High Volume**: Use `engine=\"regex\"` for maximum speed on structured data\n", + "- **Custom Entities**: Explore GLiNER models for specialized use cases\n", + "- **Integration**: Check out our [documentation](https://docs.datafog.ai) for API details\n", + "\n", + "### 💬 Get Help\n", + "\n", + "- 📖 [Documentation](https://docs.datafog.ai)\n", + "- 💬 [Discord Community](https://discord.gg/bzDth394R4)\n", + "- 🐛 [GitHub Issues](https://github.com/datafog/datafog-python/issues)\n", + "- 📧 [Contact Us](mailto:hi@datafog.ai)\n", + "\n", + "**Happy data processing with DataFog! 
🌟**" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/tests/benchmark_text_service.py b/tests/benchmark_text_service.py index 5ac8ec3f..52fb1783 100644 --- a/tests/benchmark_text_service.py +++ b/tests/benchmark_text_service.py @@ -9,7 +9,10 @@ @pytest.fixture def sample_text_10kb(): - """Generate a 10KB sample text with various PII entities.""" + """Generate a sample text with various PII entities. + + Note: Reduced size for CI environments to prevent memory issues. + """ # Base text with PII entities base_text = ( "Contact John Doe at john.doe@example.com or call (555) 123-4567. " @@ -20,8 +23,16 @@ def sample_text_10kb(): "Her phone number is 555-987-6543 and email is jane.smith@company.org. " ) - # Repeat the text to reach approximately 10KB - repetitions = 10000 // len(base_text) + 1 + # Check if running in CI environment + import os + + if os.environ.get("CI") or os.environ.get("GITHUB_ACTIONS"): + # Use smaller sample in CI to prevent memory issues + repetitions = 50 + else: + # Use full size for local development + repetitions = 10000 // len(base_text) + 1 + return base_text * repetitions