diff --git a/.github/workflows/beta-release.yml b/.github/workflows/beta-release.yml
index 6c7cb670..529ceca0 100644
--- a/.github/workflows/beta-release.yml
+++ b/.github/workflows/beta-release.yml
@@ -2,8 +2,7 @@ name: Beta Release (Thursday)
on:
schedule:
- # Thursday at 2 AM UTC - consolidate week's alpha changes into beta
- - cron: '0 2 * * 4'
+ - cron: '0 2 * * 4' # Thursday at 2 AM UTC
workflow_dispatch:
inputs:
dry_run:
@@ -33,28 +32,23 @@ jobs:
- name: Check for changes since last beta release
id: changes
run: |
- # Get last beta release tag
LAST_BETA=$(git tag -l "*b*" --sort=-version:refname | head -n1)
-
+
if [ -z "$LAST_BETA" ]; then
- echo "No previous beta release found, checking last week"
- SINCE="1 week ago"
- COMMIT_COUNT=$(git rev-list --count --since="$SINCE" dev)
+ echo "No previous beta release found"
+ COMMIT_COUNT=$(git rev-list --count --since="1 week ago" dev)
else
echo "Last beta release: $LAST_BETA"
COMMIT_COUNT=$(git rev-list --count ${LAST_BETA}..dev)
fi
-
- echo "Commits since last beta: $COMMIT_COUNT"
+
echo "commit_count=$COMMIT_COUNT" >> $GITHUB_OUTPUT
echo "last_beta=$LAST_BETA" >> $GITHUB_OUTPUT
-
- if [ "$COMMIT_COUNT" -gt 0 ] || [ "${{ github.event.inputs.force_build }}" = "true" ]; then
+
+ if [ "$COMMIT_COUNT" -gt 0 ] || [ "${{ github.event.inputs.force_build == 'true' }}" = "true" ]; then
echo "has_changes=true" >> $GITHUB_OUTPUT
- echo "โ
Changes detected, proceeding with beta build"
else
echo "has_changes=false" >> $GITHUB_OUTPUT
- echo "โน๏ธ No changes since last beta, skipping build"
fi
beta-release:
@@ -89,22 +83,18 @@ jobs:
- name: Generate beta version
id: version
run: |
- # Get current version
+ set -e
CURRENT_VERSION=$(python -c "from datafog.__about__ import __version__; print(__version__)")
echo "Current version: $CURRENT_VERSION"
-
- # Generate beta version
+
if [[ $CURRENT_VERSION == *"b"* ]]; then
- # If already beta, increment beta number
BASE_VERSION=$(echo $CURRENT_VERSION | cut -d'b' -f1)
BETA_NUM=$(echo $CURRENT_VERSION | cut -d'b' -f2)
BETA_VERSION="${BASE_VERSION}b$((BETA_NUM + 1))"
elif [[ $CURRENT_VERSION == *"a"* ]]; then
- # If alpha, convert to beta
BASE_VERSION=$(echo $CURRENT_VERSION | cut -d'a' -f1)
BETA_VERSION="${BASE_VERSION}b1"
else
- # If stable, bump minor and add beta (4.1.1 -> 4.2.0)
BASE_VERSION=$(python3 -c "
version = '$CURRENT_VERSION'
parts = version.split('.')
@@ -114,55 +104,36 @@ print('.'.join(parts))
")
BETA_VERSION="${BASE_VERSION}b1"
fi
-
- echo "Beta version: $BETA_VERSION"
+
echo "beta_version=$BETA_VERSION" >> $GITHUB_OUTPUT
-
- # Update version in files
sed -i "s/__version__ = \".*\"/__version__ = \"$BETA_VERSION\"/" datafog/__about__.py
sed -i "s/version=\".*\"/version=\"$BETA_VERSION\"/" setup.py
- - name: Generate changelog for beta
+ - name: Generate changelog
run: |
python scripts/generate_changelog.py --beta --output BETA_CHANGELOG.md
- - name: Run comprehensive tests
+ - name: Run tests
run: |
- echo "๐งช Running comprehensive test suite for beta release..."
-
- # Run core tests
python -m pytest tests/ -v --tb=short
-
- # Run integration tests
python -m pytest -m integration -v
-
- # Run benchmarks to ensure performance
python -m pytest tests/benchmark_text_service.py -v
-
- echo "โ
All tests passed for beta release"
- name: Build package
run: |
python -m build
-
- # Verify wheel size
python scripts/check_wheel_size.py
-
- echo "๐ฆ Beta package built successfully"
- - name: Create beta release
+ - name: Create GitHub release
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
BETA_VERSION="${{ steps.version.outputs.beta_version }}"
-
- # Create and push tag
git add datafog/__about__.py setup.py
git commit -m "chore: bump version to $BETA_VERSION for beta release"
git tag -a "v$BETA_VERSION" -m "Beta release $BETA_VERSION"
git push origin "v$BETA_VERSION"
-
- # Create GitHub release
+
gh release create "v$BETA_VERSION" \
--title "๐ง Beta Release $BETA_VERSION" \
--notes-file BETA_CHANGELOG.md \
@@ -170,35 +141,29 @@ print('.'.join(parts))
--target dev \
dist/*
- - name: Publish to PyPI (Beta)
+ - name: Publish to PyPI
if: github.event.inputs.dry_run != 'true'
env:
TWINE_USERNAME: __token__
TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
run: |
- echo "๐ Publishing beta release to PyPI..."
python -m twine upload dist/* --verbose
- name: Dry run summary
if: github.event.inputs.dry_run == 'true'
run: |
- echo "๐โโ๏ธ DRY RUN COMPLETED"
+ echo "๐ DRY RUN COMPLETE"
echo "Would have published: ${{ steps.version.outputs.beta_version }}"
- echo "Package contents:"
ls -la dist/
- echo "Test results: All tests would be run"
- - name: Cleanup old beta releases
+ - name: Cleanup old betas
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
- echo "๐งน Cleaning up old beta releases (keep last 5)..."
-
- # Get all beta releases, sorted by creation date
- BETA_RELEASES=$(gh release list --limit 30 | grep "๐ง.*b[0-9]" | tail -n +6 | cut -f3)
-
+ BETA_RELEASES=$(gh release list --limit 30 | grep b | tail -n +6 | cut -f3)
+
for release in $BETA_RELEASES; do
- echo "Deleting old beta release: $release"
+ echo "Deleting $release"
gh release delete "$release" --yes || true
git push --delete origin "$release" || true
done
@@ -210,35 +175,6 @@ print('.'.join(parts))
steps:
- name: Beta release notification
run: |
- echo "๐ง Thursday beta release completed!"
- echo "๐ฆ Beta version ready for final testing"
- echo "๐ก Install with: pip install datafog==${{ needs.beta-release.outputs.beta_version }}"
- echo "๐ Commits included: ${{ needs.check-changes.outputs.commit_count }}"
- echo "๐๏ธ Stable release scheduled for Friday"
- echo ""
- echo "๐งช Beta Testing Checklist:"
- echo " โ
All automated tests passed"
- echo " โณ Manual testing recommended"
- echo " โณ Performance validation"
- echo " โณ Integration testing"
-
- prepare-friday-release:
- needs: [beta-release]
- if: success()
- runs-on: ubuntu-latest
- steps:
- - name: Prepare Friday stable release
- run: |
- echo "๐ฏ Preparing for Friday stable release..."
- echo "Current beta: ${{ needs.beta-release.outputs.beta_version }}"
-
- # Extract base version for Friday
- BETA_VERSION="${{ needs.beta-release.outputs.beta_version }}"
- STABLE_VERSION=$(echo $BETA_VERSION | cut -d'b' -f1)
-
- echo "Planned stable version: $STABLE_VERSION"
- echo "๐ Friday Release Checklist:"
- echo " โณ Final beta testing"
- echo " โณ Update CHANGELOG.md"
- echo " โณ Run weekly release workflow"
- echo " โณ Social media announcement"
\ No newline at end of file
+ echo "๐ง Beta release completed!"
+ echo "Install: pip install datafog==${{ needs.beta-release.outputs.beta_version }}"
+ echo "Commits since last beta: ${{ needs.check-changes.outputs.commit_count }}"
diff --git a/.github/workflows/nightly-release.yml b/.github/workflows/nightly-release.yml
index c83e744f..85168f29 100644
--- a/.github/workflows/nightly-release.yml
+++ b/.github/workflows/nightly-release.yml
@@ -32,9 +32,8 @@ jobs:
- name: Check for changes since last alpha release
id: changes
run: |
- # Get last alpha release tag
LAST_ALPHA=$(git tag -l "*alpha*" --sort=-version:refname | head -n1)
-
+
if [ -z "$LAST_ALPHA" ]; then
echo "No previous alpha release found, checking last 24 hours"
SINCE="24 hours ago"
@@ -43,11 +42,11 @@ jobs:
echo "Last alpha release: $LAST_ALPHA"
COMMIT_COUNT=$(git rev-list --count ${LAST_ALPHA}..dev)
fi
-
+
echo "Commits since last alpha: $COMMIT_COUNT"
echo "commit_count=$COMMIT_COUNT" >> $GITHUB_OUTPUT
-
- if [ "$COMMIT_COUNT" -gt 0 ] || [ "${{ github.event.inputs.force_build }}" = "true" ]; then
+
+ if [ "$COMMIT_COUNT" -gt 0 ] || [ "${{ github.event.inputs.force_build == 'true' }}" = "true" ]; then
echo "has_changes=true" >> $GITHUB_OUTPUT
echo "โ
Changes detected, proceeding with nightly build"
else
@@ -87,20 +86,18 @@ jobs:
- name: Generate alpha version
id: version
run: |
- # Get current version
+ set -e
+
CURRENT_VERSION=$(python -c "from datafog.__about__ import __version__; print(__version__)")
echo "Current version: $CURRENT_VERSION"
-
- # Generate alpha version with timestamp
+
DATE_STAMP=$(date +"%Y%m%d")
TIME_STAMP=$(date +"%H%M")
COMMIT_SHORT=$(git rev-parse --short HEAD)
-
- # If current version already has alpha, increment it
+
if [[ $CURRENT_VERSION == *"alpha"* ]]; then
BASE_VERSION=$(echo $CURRENT_VERSION | cut -d'a' -f1)
else
- # Bump minor version for alpha (4.1.1 -> 4.2.0)
BASE_VERSION=$(python3 -c "
version = '$CURRENT_VERSION'
parts = version.split('.')
@@ -109,12 +106,11 @@ parts[2] = '0'
print('.'.join(parts))
")
fi
-
+
ALPHA_VERSION="${BASE_VERSION}a${DATE_STAMP}.${TIME_STAMP}.${COMMIT_SHORT}"
echo "Alpha version: $ALPHA_VERSION"
echo "alpha_version=$ALPHA_VERSION" >> $GITHUB_OUTPUT
-
- # Update version in files
+
sed -i "s/__version__ = \".*\"/__version__ = \"$ALPHA_VERSION\"/" datafog/__about__.py
sed -i "s/version=\".*\"/version=\"$ALPHA_VERSION\"/" setup.py
@@ -125,8 +121,6 @@ print('.'.join(parts))
- name: Build package
run: |
python -m build
-
- # Verify wheel size
python scripts/check_wheel_size.py
- name: Create alpha release
@@ -134,14 +128,12 @@ print('.'.join(parts))
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
ALPHA_VERSION="${{ steps.version.outputs.alpha_version }}"
-
- # Create and push tag
+
git add datafog/__about__.py setup.py
git commit -m "chore: bump version to $ALPHA_VERSION for nightly release"
git tag -a "v$ALPHA_VERSION" -m "Alpha release $ALPHA_VERSION"
git push origin "v$ALPHA_VERSION"
-
- # Create GitHub release
+
gh release create "v$ALPHA_VERSION" \
--title "๐ Nightly Alpha $ALPHA_VERSION" \
--notes-file ALPHA_CHANGELOG.md \
@@ -171,10 +163,9 @@ print('.'.join(parts))
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
echo "๐งน Cleaning up old alpha releases (keep last 7)..."
-
- # Get all alpha releases, sorted by creation date
+
ALPHA_RELEASES=$(gh release list --limit 50 | grep alpha | tail -n +8 | cut -f3)
-
+
for release in $ALPHA_RELEASES; do
echo "Deleting old alpha release: $release"
gh release delete "$release" --yes || true
@@ -191,4 +182,4 @@ print('.'.join(parts))
echo "๐ Nightly alpha release completed!"
echo "๐ฆ New alpha version available for testing"
echo "๐ก Install with: pip install datafog==${{ needs.nightly-release.outputs.alpha_version }}"
- echo "๐ Commits included: ${{ needs.check-changes.outputs.commit_count }}"
\ No newline at end of file
+ echo "๐ Commits included: ${{ needs.check-changes.outputs.commit_count }}"
diff --git a/README.md b/README.md
index 8b17a6ba..7ed37570 100644
--- a/README.md
+++ b/README.md
@@ -1,522 +1,382 @@
+# DataFog: PII Detection & Anonymization
+
-
+
- Comprehensive PII Detection & Anonymization
- Intelligent Engine Selection โข Lightweight โข Production Ready
+ Fast processing โข Production-ready โข Simple configuration
-
+
-
-
-
-
-
-DataFog is a comprehensive open-source library for detecting and anonymizing personally identifiable information (PII) in unstructured data. Built for production workloads, it delivers intelligent engine selection to handle both structured identifiers and contextual entities across different industries and use cases.
-
-## โก Why Choose DataFog?
+---
-**๐ง Intelligent Engine Selection**
+## Overview
-- Automatically chooses the best detection approach for your data
-- Pattern-based engine for structured PII (emails, phones, SSNs, credit cards)
-- NLP-based engine for contextual entities (names, organizations, locations)
-- Industry-optimized detection across financial, healthcare, legal, and enterprise domains
+DataFog provides efficient PII detection using a pattern-first approach that processes text significantly faster than traditional NLP methods while maintaining high accuracy.
-**๐ฆ Lightweight & Modular**
+```python
+# Basic usage example
+from datafog import DataFog
+results = DataFog().scan_text("John's email is john@example.com and SSN is 123-45-6789")
+```
-- Core package under 2MB (vs 800MB+ alternatives)
-- Install only what you need: `datafog[nlp]`, `datafog[ocr]`, `datafog[all]`
-- Zero ML model downloads for basic usage
+### Performance Comparison
-**๐ฏ Production Ready**
+| Engine | 10KB Text Processing | Relative Speed |
+| --------------------- | -------------------- | --------------- |
+| **DataFog (Pattern)** | ~4ms | **123x faster** |
+| spaCy | ~480ms | baseline |
-- Comprehensive PII coverage for diverse enterprise needs
-- Battle-tested detection patterns with high precision
-- Comprehensive test suite with 99.4% coverage
-- CLI tools and Python SDK for any workflow
+### Supported PII Types
-**๐ง Developer Friendly**
+| Type | Examples | Use Cases |
+| ---------------- | ------------------- | ---------------------- |
+| **Email** | john@company.com | Contact scrubbing |
+| **Phone** | (555) 123-4567 | Call log anonymization |
+| **SSN** | 123-45-6789 | HR data protection |
+| **Credit Cards** | 4111-1111-1111-1111 | Payment processing |
+| **IP Addresses** | 192.168.1.1 | Network log cleaning |
+| **Dates** | 01/01/1990 | Birthdate removal |
+| **ZIP Codes** | 12345-6789 | Location anonymization |
-- Simple API: `detect("Contact john@example.com")`
-- Multiple anonymization methods: redact, replace, hash
-- OCR support for images and documents
+---
-## Installation
+## Quick Start
-DataFog can be installed via pip:
+### Installation
-```
+```bash
pip install datafog
```
-# CLI
+### Basic Usage
-## ๐ Quick Reference
+**Detect PII in text:**
-| Command | Description |
-| ------------------- | ------------------------------------ |
-| `scan-text` | Analyze text for PII |
-| `scan-image` | Extract and analyze text from images |
-| `redact-text` | Redact PII in text |
-| `replace-text` | Replace PII with anonymized values |
-| `hash-text` | Hash PII in text |
-| `health` | Check service status |
-| `show-config` | Display current settings |
-| `download-model` | Get a specific spaCy model |
-| `list-spacy-models` | Show available models |
-| `list-entities` | View supported PII entities |
-
----
+```python
+from datafog import DataFog
-## ๐ Detailed Usage
+# Simple detection
+detector = DataFog()
+text = "Contact John Doe at john.doe@company.com or (555) 123-4567"
+results = detector.scan_text(text)
+print(results)
+# Finds: emails, phone numbers, and more
+```
-### Scanning Text
+**Anonymize on the fly:**
-To scan and annotate text for PII entities:
+```python
+# Redact sensitive data
+redacted = DataFog(operations=["scan", "redact"]).process_text(
+ "My SSN is 123-45-6789 and email is john@example.com"
+)
+print(redacted)
+# Output: "My SSN is [REDACTED] and email is [REDACTED]"
-```bash
-datafog scan-text "Your text here"
+# Replace with fake data
+replaced = DataFog(operations=["scan", "replace"]).process_text(
+ "Call me at (555) 123-4567"
+)
+print(replaced)
+# Output: "Call me at [PHONE_A1B2C3]"
```
-**Example:**
-
-```bash
-datafog scan-text "Tim Cook is the CEO of Apple and is based out of Cupertino, California"
-```
+**Process images with OCR:**
-### Scanning Images
+```python
+import asyncio
+from datafog import DataFog
-To extract text from images and optionally perform PII annotation:
+async def scan_document():
+ ocr_scanner = DataFog(operations=["extract", "scan"])
+ results = await ocr_scanner.run_ocr_pipeline([
+ "https://example.com/document.png"
+ ])
+ return results
-```bash
-datafog scan-image "path/to/image.png" --operations extract
+# Extract text and find PII in images
+results = asyncio.run(scan_document())
```
-**Example:**
-
-```bash
-datafog scan-image "nokia-statement.png" --operations extract
-```
+---
-To extract text and annotate PII:
+## Advanced Features
-```bash
-datafog scan-image "nokia-statement.png" --operations scan
-```
+### Engine Selection
-### Redacting Text
+Choose the appropriate engine for your needs:
-To redact PII in text:
+```python
+from datafog.services import TextService
-```bash
-datafog redact-text "Tim Cook is the CEO of Apple and is based out of Cupertino, California"
-```
+# Pattern: Fast, pattern-based (recommended)
+pattern_service = TextService(engine="pattern")
-which should output:
+# spaCy: Comprehensive NLP with broader entity recognition
+spacy_service = TextService(engine="spacy")
-```bash
-[REDACTED] is the CEO of [REDACTED] and is based out of [REDACTED], [REDACTED]
+# Auto: Combines both - tries pattern first, falls back to spaCy
+auto_service = TextService(engine="auto") # Default
```
-### Replacing Text
-
-To replace detected PII:
+### Anonymization Options
-```bash
-datafog replace-text "Tim Cook is the CEO of Apple and is based out of Cupertino, California"
-```
+```python
+from datafog import DataFog
+from datafog.models.anonymizer import AnonymizerType, HashType
-which should return something like:
+# Hash with different algorithms
+hasher = DataFog(
+ operations=["scan", "hash"],
+ hash_type=HashType.SHA256 # or MD5, SHA3_256
+)
-```bash
-[PERSON_B86CACE6] is the CEO of [UNKNOWN_445944D7] and is based out of [UNKNOWN_32BA5DCA], [UNKNOWN_B7DF4969]
+# Target specific entity types only
+selective = DataFog(
+ operations=["scan", "redact"],
+ entities=["EMAIL", "PHONE"] # Only process these types
+)
```
-Note: a unique randomly generated identifier is created for each detected entity
+### Batch Processing
-### Hashing Text
-
-You can select from SHA256, SHA3-256, and MD5 hashing algorithms to hash detected PII. Currently the hashed output does not match the length of the original entity, for privacy-preserving purposes. The default is SHA256.
+```python
+documents = [
+ "Document 1 with PII...",
+ "Document 2 with more data...",
+ "Document 3..."
+]
-```bash
-datafog hash-text "Tim Cook is the CEO of Apple and is based out of Cupertino, California"
+# Process multiple documents efficiently
+results = DataFog().batch_process(documents)
```
-generating an output which looks like this:
+---
-```bash
-5738a37f0af81594b8a8fd677e31b5e2cabd6d7791c89b9f0a1c233bb563ae39 is the CEO of f223faa96f22916294922b171a2696d868fd1f9129302eb41a45b2a2ea2ebbfd and is based out of ab5f41f04096cf7cd314357c4be26993eeebc0c094ca668506020017c35b7a9c, cad0535decc38b248b40e7aef9a1cfd91ce386fa5c46f05ea622649e7faf18fb
-```
+## Performance Benchmarks
-### Utility Commands
+Performance comparison with alternatives:
-#### ๐ฅ Health Check
+### Speed Comparison (10KB text)
-```bash
-datafog health
```
-
-#### โ๏ธ Show Configuration
-
-```bash
-datafog show-config
+DataFog Pattern: 4ms โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ 123x faster
+spaCy: 480ms โโ baseline
```
-#### ๐ฅ Download Model
+### Engine Selection Guide
-```bash
-datafog download-model en_core_web_sm
-```
+| Scenario | Recommended Engine | Why |
+| -------------------------- | ------------------ | ------------------------------------- |
+| **High-volume processing** | `pattern` | Maximum speed, consistent performance |
+| **Unknown entity types** | `spacy` | Broader entity recognition |
+| **General purpose** | `auto` | Smart fallback, best of both worlds |
+| **Real-time applications** | `pattern` | Sub-millisecond processing |
-#### ๐ Show Model Directory
+---
-```bash
-datafog show-spacy-model-directory en_core_web_sm
-```
+## CLI Usage
-#### ๐ List Models
+DataFog includes a command-line interface:
```bash
-datafog list-spacy-models
-```
+# Scan text for PII
+datafog scan-text "John's email is john@example.com"
-#### ๐ท๏ธ List Entities
+# Process images
+datafog scan-image document.png --operations extract,scan
-```bash
+# Anonymize data
+datafog redact-text "My phone is (555) 123-4567"
+datafog replace-text "SSN: 123-45-6789"
+datafog hash-text "Email: john@company.com" --hash-type sha256
+
+# Utility commands
+datafog health
datafog list-entities
+datafog show-config
```
---
-## โ ๏ธ Important Notes
+## Features
-- For `scan-image` and `scan-text` commands, use `--operations` to specify different operations. Default is `scan`.
-- Process multiple images or text strings in a single command by providing multiple arguments.
-- Ensure proper permissions and configuration of the DataFog service before running commands.
+### Security & Compliance
----
-
-๐ก **Tip:** For more detailed information on each command, use the `--help` option, e.g., `datafog scan-text --help`.
+- Detection of regulated data types for GDPR/CCPA compliance
+- Audit trails for tracking detection and anonymization
+- Configurable detection thresholds
-# Python SDK
+### Scalability
-## Getting Started
+- Batch processing for handling multiple documents
+- Memory-efficient processing for large files
+- Async support for non-blocking operations
-To use DataFog, you'll need to create a DataFog client with the desired operations. Here's a basic setup:
+### Integration Example
```python
+# FastAPI middleware example
+from fastapi import FastAPI
from datafog import DataFog
-# For text annotation
-client = DataFog(operations="scan")
-
-# For OCR (Optical Character Recognition)
-ocr_client = DataFog(operations="extract")
-```
-
-## Engine Selection
-
-DataFog now supports multiple annotation engines through the `TextService` class. You can choose between different engines for PII detection:
-
-```python
-from datafog.services.text_service import TextService
-
-# Use fast engine only (fastest, pattern-based detection)
-fast_service = TextService(engine="regex")
+app = FastAPI()
+detector = DataFog()
-# Use spaCy engine only (more comprehensive NLP-based detection)
-spacy_service = TextService(engine="spacy")
-
-# Use auto mode (default) - tries fast engine first, falls back to spaCy if no entities found
-auto_service = TextService() # engine="auto" is the default
-```
-
-Each engine targets different PII detection needs:
-
-- **regex**: Pattern-based detection optimized for structured identifiers like emails, phone numbers, credit cards, SSNs, and IP addresses
-- **spacy**: NLP-based entity recognition for contextual entities like names, organizations, locations, dates, and monetary amounts
-- **auto**: Intelligent selection - tries pattern-based detection first, falls back to NLP for comprehensive contextual analysis
-
-## Text PII Annotation
-
-Here's an example of how to annotate PII in a text document:
-
-```
-import requests
-
-# Fetch sample medical record
-doc_url = "https://gist.githubusercontent.com/sidmohan0/b43b72693226422bac5f083c941ecfdb/raw/b819affb51796204d59987893f89dee18428ed5d/note1.txt"
-response = requests.get(doc_url)
-text_lines = [line for line in response.text.splitlines() if line.strip()]
-
-# Run annotation
-annotations = client.run_text_pipeline_sync(str_list=text_lines)
-print(annotations)
+@app.middleware("http")
+async def redact_pii_middleware(request, call_next):
+ # Automatically scan/redact request data
+ pass
```
-## OCR PII Annotation
-
-For OCR capabilities, you can use the following:
-
-```
-import asyncio
-import nest_asyncio
-
-nest_asyncio.apply()
-
+---
-async def run_ocr_pipeline_demo():
- image_url = "https://s3.amazonaws.com/thumbnails.venngage.com/template/dc377004-1c2d-49f2-8ddf-d63f11c8d9c2.png"
- results = await ocr_client.run_ocr_pipeline(image_urls=[image_url])
- print("OCR Pipeline Results:", results)
+## Common Use Cases
+### Enterprise
-loop = asyncio.get_event_loop()
-loop.run_until_complete(run_ocr_pipeline_demo())
-```
+- Log sanitization
+- Data migration with PII handling
+- Compliance reporting and audits
-Note: The DataFog library uses asynchronous programming for OCR, so make sure to use the `async`/`await` syntax when calling the appropriate methods.
+### Data Science
-## Text Anonymization
+- Dataset preparation and anonymization
+- Privacy-preserving analytics
+- Research compliance
-DataFog provides various anonymization techniques to protect sensitive information. Here are examples of how to use them:
+### Development
-### Redacting Text
+- Test data generation
+- Code review for PII detection
+- API security validation
-To redact PII in text:
+---
-```python
-from datafog import DataFog
-from datafog.config import OperationType
+## Installation & Setup
-client = DataFog(operations=[OperationType.SCAN, OperationType.REDACT])
+### Basic Installation
-text = "Tim Cook is the CEO of Apple and is based out of Cupertino, California"
-redacted_text = client.run_text_pipeline_sync([text])[0]
-print(redacted_text)
+```bash
+pip install datafog
```
-Output:
+### Development Setup
+```bash
+git clone https://github.com/datafog/datafog-python
+cd datafog-python
+python -m venv .venv
+source .venv/bin/activate # On Windows: .venv\Scripts\activate
+pip install -r requirements-dev.txt
+just setup
```
-[REDACTED] is the CEO of [REDACTED] and is based out of [REDACTED], [REDACTED]
-```
-
-### Replacing Text
-To replace detected PII with unique identifiers:
+### Docker Usage
-```python
-from datafog import DataFog
-from datafog.config import OperationType
-
-client = DataFog(operations=[OperationType.SCAN, OperationType.REPLACE])
-
-text = "Tim Cook is the CEO of Apple and is based out of Cupertino, California"
-replaced_text = client.run_text_pipeline_sync([text])[0]
-print(replaced_text)
+```dockerfile
+FROM python:3.10-slim
+RUN pip install datafog
+COPY . .
+CMD ["python", "your_script.py"]
```
-Output:
+---
-```
-[PERSON_B86CACE6] is the CEO of [UNKNOWN_445944D7] and is based out of [UNKNOWN_32BA5DCA], [UNKNOWN_B7DF4969]
-```
+## Contributing
-### Hashing Text
+Contributions are welcome in the form of:
-To hash detected PII:
+- Bug reports
+- Feature requests
+- Documentation improvements
+- New pattern patterns for PII detection
+- Performance improvements
-```python
-from datafog import DataFog
-from datafog.config import OperationType
-from datafog.models.anonymizer import HashType
+### Quick Contribution Guide
-client = DataFog(operations=[OperationType.SCAN, OperationType.HASH], hash_type=HashType.SHA256)
+```bash
+# Setup development environment
+git clone https://github.com/datafog/datafog-python
+cd datafog-python
+just setup
-text = "Tim Cook is the CEO of Apple and is based out of Cupertino, California"
-hashed_text = client.run_text_pipeline_sync([text])[0]
-print(hashed_text)
-```
+# Run tests
+just test
-Output:
+# Format code
+just format
-```
-5738a37f0af81594b8a8fd677e31b5e2cabd6d7791c89b9f0a1c233bb563ae39 is the CEO of f223faa96f22916294922b171a2696d868fd1f9129302eb41a45b2a2ea2ebbfd and is based out of ab5f41f04096cf7cd314357c4be26993eeebc0c094ca668506020017c35b7a9c, cad0535decc38b248b40e7aef9a1cfd91ce386fa5c46f05ea622649e7faf18fb
+# Submit PR
+git checkout -b feature/your-improvement
+# Make your changes
+git commit -m "Add your improvement"
+git push origin feature/your-improvement
```
-You can choose from SHA256 (default), SHA3-256, and MD5 hashing algorithms by specifying the `hash_type` parameter
+See [CONTRIBUTING.md](CONTRIBUTING.md) for detailed guidelines.
-## PII Detection Capabilities
+---
-DataFog provides multiple annotation engines designed for different PII detection scenarios:
+## Benchmarking & Performance
-### Engine Selection
+### Run Benchmarks Locally
-The `TextService` class supports three engine modes:
-
-```python
-# Use regex engine for structured identifiers
-regex_service = TextService(engine="regex")
+```bash
+# Install benchmark dependencies
+pip install pytest-benchmark
-# Use spaCy engine for contextual entities
-spacy_service = TextService(engine="spacy")
+# Run performance tests
+pytest tests/benchmark_text_service.py -v
-# Use auto mode (default) - intelligent engine selection
-auto_service = TextService() # engine="auto" is the default
+# Compare with baseline
+python scripts/run_benchmark_locally.sh
```
-### PII Coverage by Engine
-
-Different engines excel at detecting different types of personally identifiable information:
-
-| Engine | PII Types Detected | Best For |
-| ------ | ------------------------------------------------------ | ------------------------------------------------------- |
-| Regex | EMAIL, PHONE, SSN, CREDIT_CARD, IP_ADDRESS, DOB, ZIP | Financial services, healthcare, compliance |
-| SpaCy | PERSON, ORG, GPE, CARDINAL, DATE, TIME, MONEY, PRODUCT | Legal documents, communication monitoring, general text |
-| Auto | All of the above (context-dependent) | Mixed data sources, unknown content types |
-
-### Industry-Specific Use Cases
-
-**Financial Services & Healthcare:**
-
-- Primary need: Structured identifiers (SSNs, credit cards, account numbers)
-- Recommended: `regex` engine for high precision on regulatory requirements
-- Common PII: ~60% structured identifiers, ~40% names/addresses
-
-**Legal & Document Review:**
-
-- Primary need: Names, organizations, locations in unstructured text
-- Recommended: `spacy` engine for comprehensive entity recognition
-- Common PII: ~30% structured identifiers, ~70% contextual entities
+### Continuous Performance Monitoring
-**Enterprise Communication & Mixed Content:**
+Our CI pipeline:
-- Primary need: Both structured and contextual PII detection
-- Recommended: `auto` engine for intelligent selection
-- Benefits from both engines depending on content type
+- Runs benchmarks on every PR
+- Compares against baseline performance
+- Fails builds if performance degrades >10%
+- Tracks performance trends over time
-### When to Use Each Engine
-
-**Regex Engine**: Choose when you need to detect specific, well-formatted identifiers:
-
-- Processing structured databases or forms
-- Compliance scanning for specific regulatory requirements (GDPR, HIPAA)
-- High-volume processing where deterministic results are important
-- Financial data with credit cards, SSNs, account numbers
-
-**SpaCy Engine**: Choose when you need contextual understanding:
-
-- Analyzing unstructured documents, emails, or communications
-- Legal eDiscovery where names and organizations are key
-- Content where entities don't follow standard patterns
-- Multi-language support requirements
-
-**Auto Engine**: Choose for general-purpose PII detection:
-
-- Unknown or mixed content types
-- Applications serving multiple industries
-- When you want comprehensive coverage without manual engine selection
-- Default choice for most production applications
-
-### Running Detection Tests
-
-You can test the different engines locally using pytest:
-
-```bash
-pip install pytest-benchmark
-pytest tests/benchmark_text_service.py -v
-```
+---
-## Examples
-
-For more detailed examples, check out our Jupyter notebooks in the `examples/` directory:
-
-- `text_annotation_example.ipynb`: Demonstrates text PII annotation
-- `image_processing.ipynb`: Shows OCR capabilities and text extraction from images
-
-These notebooks provide step-by-step guides on how to use DataFog for various tasks.
-
-### Dev Notes
-
-For local development:
-
-1. Clone the repository.
-2. Navigate to the project directory:
- ```
- cd datafog-python
- ```
-3. Create a new virtual environment (using `.venv` is recommended as it is hardcoded in the justfile):
- ```
- python -m venv .venv
- ```
-4. Activate the virtual environment:
- - On Windows:
- ```
- .venv\Scripts\activate
- ```
- - On macOS/Linux:
- ```
- source .venv/bin/activate
- ```
-5. Install the package in editable mode:
- ```
- pip install -r requirements-dev.txt
- ```
-6. Set up the project:
- ```
- just setup
- ```
-
-Now, you can develop and run the project locally.
-
-#### Important Actions:
-
-- **Format the code**:
- ```
- just format
- ```
- This runs `isort` to sort imports.
-- **Lint the code**:
- ```
- just lint
- ```
- This runs `flake8` to check for linting errors.
-- **Generate coverage report**:
- ```
- just coverage-html
- ```
- This runs `pytest` and generates a coverage report in the `htmlcov/` directory.
-
-We use [pre-commit](https://marketplace.visualstudio.com/items?itemName=elagil.pre-commit-helper) to run checks locally before committing changes. Once installed, you can run:
+## Documentation & Support
-```
-pre-commit run --all-files
-```
+| Resource | Link |
+| --------------------- | --------------------------------------------------------------------------- |
+| **Documentation** | [docs.datafog.ai](https://docs.datafog.ai) |
+| **Community Discord** | [Join here](https://discord.gg/bzDth394R4) |
+| **Bug Reports** | [GitHub Issues](https://github.com/datafog/datafog-python/issues) |
+| **Feature Requests** | [GitHub Discussions](https://github.com/datafog/datafog-python/discussions) |
+| **Support** | [hi@datafog.ai](mailto:hi@datafog.ai) |
-#### Dependencies
+---
-For OCR, we use Tesseract, which is incorporated into the build step. You can find the relevant configurations under `.github/workflows/` in the following files:
+## License & Acknowledgments
-- `dev-cicd.yml`
-- `feature-cicd.yml`
-- `main-cicd.yml`
+DataFog is released under the [MIT License](LICENSE).
-### Testing
+**Built with:**
-- Python 3.10
+- Pattern optimization for efficient processing
+- spaCy integration for NLP capabilities
+- Tesseract & Donut for OCR capabilities
+- Pydantic for data validation
-## License
+---
-This software is published under the [MIT
-license](https://en.wikipedia.org/wiki/MIT_License).
+[GitHub](https://github.com/datafog/datafog-python) โข [Documentation](https://docs.datafog.ai) โข [Discord](https://discord.gg/bzDth394R4)