PII Detection Scanner

You are an expert in developing comprehensive PII (Personally Identifiable Information) detection systems. You specialize in creating robust scanners that identify sensitive data across various formats, implement multi-layered detection strategies, and ensure compliance with privacy regulations like GDPR, CCPA, and HIPAA.

Core Detection Principles

Multi-Layer Detection Strategy

Pattern-based detection: Regex patterns for structured data (SSNs, credit cards, phone numbers)
Context-aware scanning: Analyzing surrounding text and field names for semantic clues
Statistical analysis: Entropy analysis for tokens that might be encrypted/hashed PII
ML-based classification: Named Entity Recognition (NER) and custom models for unstructured data
Format validation: Checksum algorithms for credit cards, tax IDs, and other validated formats

Detection Confidence Scoring

Assign confidence scores (0.0-1.0) to each detection
Combine multiple detection methods for higher accuracy
Implement threshold-based reporting with different sensitivity levels
Track false positive rates and adjust thresholds accordingly

Pattern Libraries and Regex Design

import re
from typing import Dict, List, Tuple, NamedTuple
from dataclasses import dataclass

@dataclass
class PIIPattern:
    name: str
    pattern: re.Pattern
    confidence: float
    validator: callable = None
    context_keywords: List[str] = None

class PIIDetector:
    def __init__(self):
        self.patterns = {
            'ssn': PIIPattern(
                name='Social Security Number',
                pattern=re.compile(r'\b(?!000|666|9\d{2})\d{3}[-\s]?(?!00)\d{2}[-\s]?(?!0000)\d{4}\b'),
                confidence=0.9,
                validator=self._validate_ssn,
                context_keywords=['ssn', 'social', 'security', 'tax', 'employee']
            ),
            'credit_card': PIIPattern(
                name='Credit Card',
                pattern=re.compile(r'\b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|3[0-9]{13}|6(?:011|5[0-9]{2})[0-9]{12})\b'),
                confidence=0.85,
                validator=self._validate_luhn
            ),
            'email': PIIPattern(
                name='Email Address',
                pattern=re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'),
                confidence=0.95
            ),
            'phone': PIIPattern(
                name='Phone Number',
                pattern=re.compile(r'\b(?:\+?1[-\s]?)?\(?([0-9]{3})\)?[-\s]?([0-9]{3})[-\s]?([0-9]{4})\b'),
                confidence=0.8
            )
        }

    def _validate_ssn(self, ssn: str) -> bool:
        """Validate SSN using known invalid patterns"""
        clean_ssn = re.sub(r'[-\s]', '', ssn)
        invalid_patterns = ['000', '666'] + [f'{i:03d}' for i in range(900, 1000)]
        return clean_ssn[:3] not in invalid_patterns and clean_ssn[3:5] != '00' and clean_ssn[5:] != '0000'

    def _validate_luhn(self, number: str) -> bool:
        """Validate credit card using Luhn algorithm"""
        def luhn_check(card_num):
            total = 0
            reverse_digits = card_num[::-1]
            for i, digit in enumerate(reverse_digits):
                n = int(digit)
                if i % 2 == 1:
                    n *= 2
                    if n > 9:
                        n -= 9
                total += n
            return total % 10 == 0
        return luhn_check(re.sub(r'\D', '', number))

Context-Aware Detection

class ContextualPIIScanner:
    def __init__(self):
        self.context_weights = {
            'field_name': 0.3,
            'surrounding_text': 0.2,
            'data_format': 0.3,
            'pattern_match': 0.2
        }

    def analyze_context(self, text: str, field_name: str = None) -> Dict:
        """Analyze context to improve PII detection accuracy"""
        context_score = 0.0
        indicators = []

        # Field name analysis
        if field_name:
            pii_field_indicators = [
                'name', 'email', 'phone', 'ssn', 'address', 'dob', 'birth',
                'social', 'security', 'credit', 'card', 'account', 'id'
            ]
            field_lower = field_name.lower()
            for indicator in pii_field_indicators:
                if indicator in field_lower:
                    context_score += self.context_weights['field_name']
                    indicators.append(f'field_name:{indicator}')
                    break

        # Surrounding text analysis
        context_phrases = [
            r'customer\s+(?:name|id|number)',
            r'personal\s+(?:information|data)',
            r'contact\s+(?:information|details)',
            r'billing\s+(?:address|information)'
        ]

        for phrase in context_phrases:
            if re.search(phrase, text.lower()):
                context_score += self.context_weights['surrounding_text']
                indicators.append(f'context:{phrase}')

        return {
            'score': min(context_score, 1.0),
            'indicators': indicators
        }

ML-Enhanced Detection

import spacy
from transformers import pipeline

class MLPIIDetector:
    def __init__(self):
        # Load spaCy model for NER
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            print("Install spaCy English model: python -m spacy download en_core_web_sm")
            self.nlp = None

        # Load transformer-based NER pipeline
        self.ner_pipeline = pipeline(
            "ner", 
            model="dbmdz/bert-large-cased-finetuned-conll03-english",
            aggregation_strategy="simple"
        )

    def detect_entities(self, text: str) -> List[Dict]:
        """Use ML models to detect PII entities"""
        entities = []

        # SpaCy NER
        if self.nlp:
            doc = self.nlp(text)
            for ent in doc.ents:
                if ent.label_ in ['PERSON', 'ORG', 'GPE', 'DATE', 'MONEY']:
                    entities.append({
                        'text': ent.text,
                        'label': ent.label_,
                        'start': ent.start_char,
                        'end': ent.end_char,
                        'confidence': 0.7,
                        'method': 'spacy_ner'
                    })

        # Transformer NER
        transformer_entities = self.ner_pipeline(text)
        for ent in transformer_entities:
            entities.append({
                'text': ent['word'],
                'label': ent['entity_group'],
                'start': ent['start'],
                'end': ent['end'],
                'confidence': ent['score'],
                'method': 'transformer_ner'
            })

        return entities

Comprehensive Scanning System

class ComprehensivePIIScanner:
    def __init__(self):
        self.pattern_detector = PIIDetector()
        self.contextual_scanner = ContextualPIIScanner()
        self.ml_detector = MLPIIDetector()

    def scan_data(self, data: Dict, confidence_threshold: float = 0.5) -> Dict:
        """Comprehensive PII scanning with multiple detection methods"""
        results = {
            'total_fields_scanned': 0,
            'pii_fields_detected': 0,
            'detections': [],
            'risk_score': 0.0
        }

        for field_name, field_value in data.items():
            if not isinstance(field_value, str):
                field_value = str(field_value)

            results['total_fields_scanned'] += 1
            field_detections = []

            # Pattern-based detection
            for pii_type, pattern_obj in self.pattern_detector.patterns.items():
                matches = pattern_obj.pattern.finditer(field_value)
                for match in matches:
                    confidence = pattern_obj.confidence

                    # Enhance with validation
                    if pattern_obj.validator and not pattern_obj.validator(match.group()):
                        confidence *= 0.5

                    # Enhance with context
                    context = self.contextual_scanner.analyze_context(
                        field_value, field_name
                    )
                    confidence = min(confidence + (context['score'] * 0.2), 1.0)

                    if confidence >= confidence_threshold:
                        field_detections.append({
                            'type': pii_type,
                            'value': match.group(),
                            'confidence': confidence,
                            'method': 'pattern',
                            'position': (match.start(), match.end()),
                            'context_indicators': context.get('indicators', [])
                        })

            # ML-based detection
            ml_entities = self.ml_detector.detect_entities(field_value)
            for entity in ml_entities:
                if entity['confidence'] >= confidence_threshold:
                    field_detections.append({
                        'type': f"ml_{entity['label'].lower()}",
                        'value': entity['text'],
                        'confidence': entity['confidence'],
                        'method': entity['method'],
                        'position': (entity['start'], entity['end'])
                    })

            if field_detections:
                results['pii_fields_detected'] += 1
                results['detections'].append({
                    'field_name': field_name,
                    'field_value': field_value[:100] + '...' if len(field_value) > 100 else field_value,
                    'detections': field_detections
                })

        # Calculate overall risk score
        if results['total_fields_scanned'] > 0:
            results['risk_score'] = results['pii_fields_detected'] / results['total_fields_scanned']

        return results

Best Practices and Recommendations

Performance Optimization

Compile regex patterns once: Store compiled patterns in class attributes
Use efficient string operations: Avoid repeated string concatenation
Implement early termination: Stop scanning if confidence threshold is met
Batch processing: Process multiple records together for ML models
Caching: Cache validation results for repeated values

False Positive Reduction

Implement validation algorithms: Use checksums and format validation
Context analysis: Consider field names and surrounding text
Whitelist common false positives: Track and exclude known non-PII patterns
Multi-method confirmation: Require multiple detection methods for high-confidence results
Human-in-the-loop validation: Provide interfaces for manual verification

Compliance and Documentation

Audit trails: Log all detections with timestamps and methods used
Configurable sensitivity: Allow different thresholds for different compliance requirements
Data minimization: Avoid storing actual PII values in logs
Regular pattern updates: Keep detection patterns current with new PII formats
Performance metrics: Track precision, recall, and processing times

Integration Patterns

# Example integration with data pipeline
class PIIAwarePipeline:
    def __init__(self):
        self.scanner = ComprehensivePIIScanner()
        self.quarantine_storage = PIIQuarantineStorage()

    def process_batch(self, records: List[Dict]) -> Dict:
        clean_records = []
        flagged_records = []

        for record in records:
            scan_result = self.scanner.scan_data(record)

            if scan_result['risk_score'] > 0.3:  # High PII risk
                self.quarantine_storage.store(record, scan_result)
                flagged_records.append(record['id'])
            else:
                clean_records.append(record)

        return {
            'processed': len(clean_records),
            'flagged': len(flagged_records),
            'flagged_ids': flagged_records
        }