#!/usr/bin/env python3
"""
Humanizer - Strip AI writing patterns from text.
Usage: python3 humanize.py <input-file> [--output <output-file>]
       cat file.txt | python3 humanize.py
       python3 humanize.py --test
"""

import re
import sys
import argparse
from pathlib import Path
from typing import List, Tuple, Dict

# AI patterns to detect and fix
# AI patterns to detect and fix.
# Each table is a list of (regex, replacement) pairs applied in order by
# clean_text().  A pattern containing an uppercase letter that has an
# all-lowercase twin in the tables is applied case-sensitively (so each
# variant substitutes the correctly-cased replacement); every other pattern
# matches case-insensitively.

# Throat-clearing openers and empty hedges; most are simply deleted.
HEDGING_PHRASES = [
    (r"\bIt's worth noting that\b", ""),
    (r"\bIt is worth noting that\b", ""),
    (r"\bIt's important to (note|remember|understand|consider)\b", ""),
    (r"\bIt is important to (note|remember|understand|consider)\b", ""),
    (r"\bIt's crucial to understand\b", ""),
    (r"\bIt is crucial to understand\b", ""),
    (r"\bIt should be noted that\b", ""),
    (r"\bOne might argue that\b", ""),
    (r"\bArguably,\b", ""),
    (r"\bTo some extent\b", ""),
    (r"\bIn a sense\b", ""),
    (r"\bAt the end of the day\b", "Bottom line:"),
    (r"\bIn conclusion\b", ""),
    (r"\bTo conclude\b", ""),
    (r"\bIn summary\b", ""),
    (r"\bTo summarize\b", ""),
]

# Overused AI-marketing vocabulary mapped to plainer words.
STOCK_PHRASES = [
    (r"\bdelve into\b", "dig into"),
    (r"\bnavigate the landscape\b", "deal with"),
    (r"\bnavigate the complex landscape\b", "deal with"),
    (r"\bin today's world\b", ""),
    (r"\bin today's digital age\b", ""),
    (r"\brealm of\b", "field of"),
    (r"\btapestry of\b", ""),
    (r"\bever-evolving\b", "changing"),
    (r"\bat its core\b", ""),
    (r"\bholy grail\b", "goal"),
    (r"\bgame-changer\b", ""),
    (r"\bgame changer\b", ""),
    (r"\bparadigm shift\b", "shift"),
    (r"\bperfect storm\b", ""),
    (r"\bperfect blend\b", "mix"),
    (r"\bunleash\b", "release"),
    (r"\bunlock\b", "access"),
    (r"\bempower\b", "help"),
    (r"\bfoster\b", "build"),
    (r"\bleverage\b", "use"),
    (r"\butilize\b", "use"),
    (r"\bfacilitate\b", "help"),
    (r"\boptimize\b", "improve"),
    (r"\bstreamline\b", "simplify"),
    (r"\brobust\b", "strong"),
    (r"\bseamless\b", "smooth"),
    (r"\bcomprehensive\b", "complete"),
    (r"\bcutting-edge\b", "new"),
    (r"\bstate-of-the-art\b", "modern"),
    (r"\bembark on\b", "start"),
    # NOTE(review): very broad — also rewrites literal travel "journey"s.
    (r"\bjourney\b", "process"),
    (r"\bharness\b", "use"),
    (r"\belevate\b", "raise"),
    (r"\btransformative\b", ""),
    (r"\bnuanced\b", ""),
    (r"\bmultifaceted\b", "complex"),
    (r"\bholistic\b", "complete"),
    (r"\bsynergy\b", ""),
    (r"\bsynergies\b", ""),
    # NOTE(review): broad — "the scale of the problem" becomes "the grow of
    # the problem"; confirm this is intended.
    (r"\bscale\b", "grow"),
    (r"\bscaling\b", "growing"),
]

# Sentence-initial connective tissue that AI text overuses; removed outright.
TRANSITION_ABUSE = [
    (r"\bFurthermore,?\s*", ""),
    (r"\bMoreover,?\s*", ""),
    (r"\bAdditionally,?\s*", ""),
    (r"\bIn addition,?\s*", ""),
    (r"\bConsequently,?\s*", ""),
    (r"\bSubsequently,?\s*", ""),
    (r"\bNevertheless,?\s*", ""),
    (r"\bNonetheless,?\s*", ""),
    (r"\bNotwithstanding,?\s*", ""),
    (r"\bThus,?\s*", ""),
    (r"\bTherefore,?\s*", ""),
    (r"\bHence,?\s*", ""),
]

# Intensifiers that add no information; mostly deleted.
VAGUE_INTENSIFIERS = [
    (r"\bincredibly\b", ""),
    (r"\bremarkably\b", ""),
    (r"\bsignificantly\b", ""),
    (r"\bsubstantially\b", ""),
    (r"\btruly\b", ""),
    (r"\bextremely\b", "very"),
    (r"\bquite\b", ""),
    (r"\bsomewhat\b", ""),
    # NOTE(review): also strips "rather than" down to "than" — confirm.
    (r"\brather\b", ""),
    (r"\bfairly\b", ""),
    (r"\bsurprisingly\b", ""),
    (r"\binterestingly\b", ""),
]

# Wordy constructions shortened.  Capitalized/lowercase twins exist so that
# sentence-initial and mid-sentence occurrences each get the right case;
# clean_text() applies these twins case-sensitively.
REDUNDANT_PHRASES = [
    (r"\bin order to\b", "to"),
    (r"\bDue to the fact that\b", "Because"),
    (r"\bdue to the fact that\b", "because"),
    (r"\bAt this point in time\b", "Now"),
    (r"\bat this point in time\b", "now"),
    (r"\bIn the event that\b", "If"),
    (r"\bin the event that\b", "if"),
    (r"\bFor the purpose of\b", "To"),
    (r"\bfor the purpose of\b", "to"),
    (r"\bWith regard to\b", "About"),
    (r"\bwith regard to\b", "about"),
    (r"\bIn terms of\b", ""),
    (r"\bin terms of\b", ""),
    (r"\bthe fact that\b", ""),
    (r"\bA number of\b", "Several"),
    (r"\ba number of\b", "several"),
    (r"\bA variety of\b", "Various"),
    (r"\ba variety of\b", "various"),
    (r"\bIn today's\b", "In"),
    (r"\bin today's\b", "in"),
    (r"\bthe question of\b", ""),
    (r"\bthe issue of\b", ""),
    (r"\bthe matter of\b", ""),
]

# Fake-personal framing devices; removed.
PERFORMED_AUTHENTICITY = [
    # NOTE(review): "\bAs a \w+,\s*" also strips legitimate "As a result,"
    # — confirm whether a whitelist is needed.
    (r"\bAs a \w+,\s*", ""),
    (r"\bSpeaking from experience,?\s*", ""),
    (r"\bI can tell you that\s*", ""),
    (r"\bLet me be clear:?\s*", ""),
    (r"\bHere's the thing:?\s*", ""),
    (r"\bThe truth is\s*", ""),
    (r"\bReal talk:?\s*", ""),
    (r"\bLet's dive in\.?\s*", ""),
    (r"\bLet's get started\.?\s*", ""),
    (r"\bI wanted to take a moment to\s*", ""),
    (r"\bI'd like to\s*", ""),
    (r"\bI would like to\s*", ""),
    (r"\bIt goes without saying that\s*", ""),
]

# Passive/impersonal constructions rewritten to direct address.
PASSIVE_VOICE = [
    # NOTE(review): this entry never fires — HEDGING_PHRASES already deletes
    # "It should be noted that" before this table is applied.
    (r"\bIt should be noted that\b", "Note that"),
    (r"\bIt can be seen that\b", "You can see that"),
    (r"\bThis can be achieved by\b", "You can do this by"),
    (r"\bSteps should be taken to\b", "Take steps to"),
    (r"\bConsideration should be given to\b", "Consider"),
    (r"\bIt has been determined that\b", ""),
    (r"\bIt was found that\b", ""),
    (r"\bThere is a need to\b", "You need to"),
    (r"\bThere are many\b", "There are"),
    (r"\bThere is a\b", "There's a"),
]

def clean_text(text: str) -> str:
    """Apply all cleaning rules to *text* and return the cleaned string.

    Replacements run in table order (hedging, stock phrases, transitions,
    intensifiers, redundancies, performed authenticity, passive voice),
    then excess em dashes are thinned to at most one per 300 words, and
    finally doubled spaces and space-before-punctuation artifacts left by
    the substitutions are tidied.
    """
    result = text

    # Apply all pattern replacements, in table order (order matters: e.g.
    # HEDGING_PHRASES deliberately runs before PASSIVE_VOICE).
    all_patterns = (
        HEDGING_PHRASES +
        STOCK_PHRASES +
        TRANSITION_ABUSE +
        VAGUE_INTENSIFIERS +
        REDUNDANT_PHRASES +
        PERFORMED_AUTHENTICITY +
        PASSIVE_VOICE
    )

    # Fix: patterns with an explicit all-lowercase twin (the case-paired
    # rows in REDUNDANT_PHRASES) must match case-sensitively.  Previously
    # every pattern was applied with IGNORECASE, so the capitalized variant
    # fired first on lowercase text and injected wrong-case replacements
    # ("left due to the fact that" -> "left Because").
    lowercase_patterns = {pat for pat, _ in all_patterns if pat == pat.lower()}

    for pattern, replacement in all_patterns:
        case_paired = (pattern != pattern.lower()
                       and pattern.lower() in lowercase_patterns)
        flags = 0 if case_paired else re.IGNORECASE
        result = re.sub(pattern, replacement, result, flags=flags)

    # Clean up em dash overuse (keep max 1 per 300 words).
    em_dashes = result.count('—') + result.count('--')
    word_count = len(result.split())
    max_em_dashes = max(1, word_count // 300)

    if em_dashes > max_em_dashes:
        # Split on both the Unicode em dash and the ASCII "--" form.
        # Fix: the old pattern was r'—|—' (same character twice), so '--'
        # was counted above but never thinned here.
        parts = re.split(r'—|--', result)
        if len(parts) > max_em_dashes + 1:
            # Keep the first N em dashes (normalized to '—'), turn the
            # rest into commas.
            result = parts[0]
            for i, part in enumerate(parts[1:], 1):
                if i <= max_em_dashes:
                    result += '—' + part
                else:
                    result += ',' + part

    # Whitespace cleanup runs last so it also tidies the output of the
    # dash replacement above (", " joins) and gaps left by deletions.
    result = re.sub(r'  +', ' ', result)
    result = re.sub(r' ([,.:;!?])', r'\1', result)

    return result.strip()


def check_ai_patterns(text: str) -> List[str]:
    """Check *text* for residual AI writing patterns.

    Returns a list of human-readable warning strings (empty when nothing
    suspicious is found).  Four heuristics: em dash density, a blacklist of
    stock AI phrases, "<word>ed by" passive-voice markers, and overly
    uniform sentence lengths.

    Note: the return annotation was previously ``List[Tuple[str, int]]``,
    but every appended element is a plain ``str``.
    """
    warnings = []

    # Em dash density: flag more than 1 per 100 words.
    # NOTE(review): only the Unicode em dash is counted here; clean_text()
    # also handles ASCII '--' — confirm whether it should count here too.
    em_dashes = text.count('—')
    words = len(text.split())
    if words > 0 and em_dashes / words > 0.01:  # More than 1 per 100 words
        warnings.append(f"High em dash density: {em_dashes} in {words} words")

    # Substring blacklist of phrases that clean_text() should have removed.
    ai_phrases = [
        "it's worth noting", "it is worth noting", "delve into",
        "in today's world", "navigate the landscape", "leverage",
        "utilize", "facilitate", "furthermore", "moreover",
        "additionally", "consequently", "subsequently"
    ]

    text_lower = text.lower()
    for phrase in ai_phrases:
        if phrase in text_lower:
            count = text_lower.count(phrase)
            warnings.append(f"Found '{phrase}': {count} time(s)")

    # Rough passive-voice heuristic: "<word>ed by"; tolerate up to 2 hits.
    passive_indicators = re.findall(r'\b\w+ed by\b', text_lower)
    if len(passive_indicators) > 2:
        warnings.append(f"Possible passive voice: {len(passive_indicators)} instances")

    # Sentence length variation: human prose mixes short and long sentences;
    # low variance across 6+ sentences suggests machine-generated rhythm.
    sentences = re.split(r'[.!?]+', text)
    sentences = [s.strip() for s in sentences if s.strip()]
    if len(sentences) > 5:
        lengths = [len(s.split()) for s in sentences]
        avg_length = sum(lengths) / len(lengths)
        variance = sum((l - avg_length) ** 2 for l in lengths) / len(lengths)
        if variance < 25:  # Low variance = too uniform
            warnings.append("Sentence lengths too uniform (AI pattern)")

    return warnings


def run_regression_tests():
    """Run regression tests to catch patterns that keep coming back.

    Each case pairs an input string with a substring that must be absent
    from the cleaned output.  Prints a PASS/FAIL line per case plus a
    summary, and returns True only when every case passes.
    """
    cases = [
        # (input, should_not_contain)
        ("It's worth noting that this is important.", "worth noting"),
        ("Let's delve into the details.", "delve into"),
        ("Furthermore, it's crucial to understand.", "furthermore"),
        ("In order to succeed, you must leverage these tools.", "in order to"),
        ("The solution is incredibly effective and robust.", "incredibly"),
        ("It should be noted that this works.", "should be noted"),
        ("At the end of the day, it's a game-changer.", "game-changer"),
        ("We need to navigate the complex landscape.", "navigate the"),
        ("This is truly remarkable and seamless.", "truly"),
        ("Let me be clear: this is important.", "let me be clear"),
    ]

    print("Running regression tests...")
    failures = 0

    for sample, banned in cases:
        cleaned = clean_text(sample)
        if banned.lower() not in cleaned.lower():
            print(f"  PASS: '{sample[:40]}...'")
            continue
        print(f"  FAIL: '{sample[:50]}...' still contains '{banned}'")
        print(f"        Result: '{cleaned}'")
        failures += 1

    print(f"\nResults: {len(cases) - failures} passed, {failures} failed")
    return failures == 0


def main():
    """Command-line entry point: parse arguments, read text, clean, emit.

    Input comes from a file argument or stdin; output goes to a file when
    --output is given, otherwise stdout.  --test runs the regression suite
    and exits; --check prints residual-pattern warnings to stderr.
    """
    parser = argparse.ArgumentParser(description='Strip AI writing patterns from text')
    parser.add_argument('input', nargs='?', help='Input file path')
    parser.add_argument('--output', '-o', help='Output file path')
    parser.add_argument('--test', action='store_true', help='Run regression tests')
    parser.add_argument('--check', action='store_true', help='Check for remaining AI patterns')
    args = parser.parse_args()

    # Test mode short-circuits everything else.
    if args.test:
        sys.exit(0 if run_regression_tests() else 1)

    # Pick the input source: explicit file, then piped stdin; with neither,
    # show usage and bail.
    if args.input:
        text = Path(args.input).read_text()
    elif sys.stdin.isatty():
        parser.print_help()
        sys.exit(1)
    else:
        text = sys.stdin.read()

    cleaned = clean_text(text)

    # Optionally report patterns that survived cleaning (stderr, so they
    # don't mix with the cleaned text on stdout).
    if args.check:
        remaining = check_ai_patterns(cleaned)
        if remaining:
            print("Warnings:", file=sys.stderr)
            for warning in remaining:
                print(f"  - {warning}", file=sys.stderr)

    # Emit the result.
    if args.output:
        Path(args.output).write_text(cleaned)
        print(f"Written to {args.output}")
    else:
        print(cleaned)


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
