#!/usr/bin/env python3
"""
Daily Cannabis News Scraper for CannaDealsFL Blog
Fetches cannabis news, generates blog posts via AI with header images, publishes to site.
Prevents duplicates by checking existing post titles.
"""

import os
import sys
import json
import re
import hashlib
import requests
import subprocess
from datetime import datetime, timedelta
from pathlib import Path
import feedparser

# API Configuration for the CannaDealsFL blog backend
API_BASE = "https://www.cannadealsfl.com/api/v1"
API_KEY = os.environ.get("ADMIN_API_KEY", "")  # required; validated in main()

# News Sources: RSS feeds polled on every run (Google News queries + MJBizDaily)
NEWS_SOURCES = [
    "https://news.google.com/rss/search?q=florida+medical+cannabis+marijuana&hl=en-US&gl=US&ceid=US:en",
    "https://news.google.com/rss/search?q=florida+dispensary+news+deals&hl=en-US&gl=US&ceid=US:en",
    "https://feeds.mjbizdaily.com/mjbizdaily",
]

# Gemini API for content generation; key is also validated in main()
GEMINI_API_KEY = os.environ.get("GOOGLE_GENERATIVE_AI_API_KEY", "")
GEMINI_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={GEMINI_API_KEY}"

# Track processed articles to prevent duplicates
# NOTE(review): absolute, user-specific path — will not exist on other
# machines; consider deriving from an env var or the script location.
PROCESSED_FILE = "/home/crogers2287/cannadealsfl/logs/.news_processed.json"


def load_processed():
    """Load the list of previously processed article hashes.

    Returns:
        list: Hashes written by save_processed(), or an empty list when the
        tracking file is missing, unreadable, or contains invalid JSON.
    """
    try:
        with open(PROCESSED_FILE, 'r') as f:
            return json.load(f)
    except (OSError, json.JSONDecodeError):
        # A missing or corrupt tracking file just means "nothing processed
        # yet". The original bare `except:` also swallowed KeyboardInterrupt
        # and SystemExit; catch only the expected failures.
        return []


def save_processed(processed):
    """Persist processed article hashes, retaining only the newest 100.

    Creates the log directory on first use so the write never fails on a
    fresh machine.
    """
    os.makedirs(os.path.dirname(PROCESSED_FILE), exist_ok=True)
    # Cap history so the tracking file never grows without bound.
    recent = processed[-100:]
    with open(PROCESSED_FILE, 'w') as f:
        json.dump(recent, f)


def article_hash(title, link):
    """Return a short, stable hex digest identifying an article.

    Title and link are joined with '|' and hashed with MD5 (fine here:
    the hash is an identifier, not a security boundary); the first 16 hex
    characters are plenty for deduplication.
    """
    key = "|".join((title, link)).encode()
    return hashlib.md5(key).hexdigest()[:16]


def get_existing_posts():
    """Fetch titles of published blog posts, lowercased, for dedup checks.

    Any network/API failure degrades gracefully to an empty list — the
    pipeline keeps running, just without duplicate-title protection.
    """
    url = f"{API_BASE}/blog?limit=50&is_published=true"
    try:
        resp = requests.get(url, headers={"X-API-Key": API_KEY}, timeout=30)
        resp.raise_for_status()
        posts = resp.json().get("data", [])
        return [post["title"].lower() for post in posts]
    except Exception as e:
        print(f"Warning: Could not fetch existing posts: {e}")
        return []


def fetch_news():
    """Collect up to 10 recent entries from each configured RSS feed.

    Returns:
        list[dict]: Entries with title/link/summary/published keys. Feeds
        that fail to parse are skipped with a console warning.
    """
    collected = []
    for source in NEWS_SOURCES:
        try:
            entries = feedparser.parse(source).entries[:10]
            collected.extend(
                {
                    "title": entry.title,
                    "link": entry.link,
                    "summary": getattr(entry, "summary", ""),
                    "published": getattr(entry, "published", datetime.now().isoformat()),
                }
                for entry in entries
            )
        except Exception as e:
            print(f"Error fetching {source}: {e}")
    return collected


def filter_new_articles(articles, processed, existing_titles):
    """Drop articles already processed or whose titles resemble live posts.

    Args:
        articles: dicts with at least "title" and "link" keys.
        processed: hashes of articles handled on earlier runs.
        existing_titles: lowercased titles of already-published posts.

    Returns:
        list[tuple]: (article, hash) pairs for genuinely new articles.
    """
    # Build the set once: the original did an O(n) list scan per article.
    seen_hashes = set(processed)
    new_articles = []
    for article in articles:
        h = article_hash(article["title"], article["link"])
        if h in seen_hashes:
            continue

        # Fuzzy duplicate check: the first 50 chars of the new title contain
        # (or are contained by) an existing title.
        prefix = article["title"].lower()[:50]
        if any(prefix in existing or existing in prefix
               for existing in existing_titles):
            print(f"  Skipping duplicate: {article['title'][:50]}...")
            continue

        new_articles.append((article, h))
    return new_articles


def generate_header_image(title, slug):
    """Generate a header image via the nano-banana-pro (Gemini) skill and
    upload it to the "carl" host over scp.

    Args:
        title: Post title, interpolated into the image prompt.
        slug: URL slug; used for the temp filename and remote path.

    Returns:
        str | None: Public image URL on success, None on any failure.
    """
    # (The original re-imported subprocess locally; it is already imported
    # at module level, so the redundant import is removed.)
    prompt = f"""Professional blog header image for a Florida medical cannabis website.
Topic: {title}
Style: Clean, modern design with cannabis plant elements (leaves, green tones)
Requirements:
- Professional editorial aesthetic suitable for a news article
- No text or words in the image
- Subtle gradient background in greens/purples
- Abstract or stylized cannabis imagery
- Horizontal banner format (2K resolution)
Create an image for a Florida medical cannabis news website."""

    # NOTE(review): predictable /tmp path; fine for a single-user cron job,
    # but tempfile.mkstemp would be safer on shared machines.
    temp_path = f"/tmp/{slug}-header.png"

    try:
        result = subprocess.run([
            "uv", "run",
            "/usr/lib/node_modules/openclaw/skills/nano-banana-pro/scripts/generate_image.py",
            "--prompt", prompt,
            "--filename", temp_path,
            "--resolution", "2K"
        ], env={**os.environ, "GEMINI_API_KEY": GEMINI_API_KEY},
           capture_output=True, text=True, timeout=120, cwd="/usr/lib/node_modules/openclaw/skills/nano-banana-pro")

        if result.returncode != 0 or not os.path.exists(temp_path):
            print(f"  ❌ Image generation failed: {result.stderr[:200]}")
            return None

        print(f"  ✅ Image generated: {temp_path}")

        # Upload to carl
        remote_path = f"~/cannadealsfl/public/images/blog/headers/{slug}.png"
        upload_result = subprocess.run([
            "scp", temp_path, f"carl:{remote_path}"
        ], capture_output=True, text=True, timeout=30)

        if upload_result.returncode != 0:
            print(f"  ❌ Upload failed: {upload_result.stderr}")
            return None

        # Verify the file actually landed on the remote host.
        verify = subprocess.run([
            "ssh", "carl", f"ls -la {remote_path}"
        ], capture_output=True, text=True, timeout=10)

        if verify.returncode != 0:
            print(f"  ❌ Upload verification failed")
            return None

        image_url = f"https://cannadealsfl.com/images/blog/headers/{slug}.png"
        print(f"  ✅ Image uploaded: {image_url}")
        return image_url

    except subprocess.TimeoutExpired:
        print("  ❌ Image generation timed out")
        return None
    except Exception as e:
        print(f"  ❌ Image generation error: {e}")
        return None
    finally:
        # The original only removed the temp file on the success path,
        # leaking it on every failure. Always clean up.
        try:
            os.remove(temp_path)
        except OSError:
            pass


def generate_blog_post(articles):
    """Ask Gemini to write an SEO-optimized blog post from news articles.

    Uses at most the first three articles. Returns the parsed JSON dict
    (title, excerpt, meta_description, content, tags, focus_keyword) on
    success, or None on any API/parsing failure.
    """
    if not articles:
        return None

    # Condense the source articles into a plain-text digest for the prompt.
    digest = "\n\n".join(
        f"Title: {a['title']}\nSummary: {a['summary']}\nLink: {a['link']}"
        for a in articles[:3]
    )
    today = datetime.now().strftime("%B %d, %Y")

    prompt = f"""You are a cannabis industry blogger for CannaDealsFL, a Florida medical cannabis deal aggregator website.

Create an engaging, SEO-optimized blog post about recent cannabis news relevant to Florida MMJ patients.

Date: {today}

News sources to reference:
{digest}

Requirements:
1. Create a catchy title that includes "Florida Cannabis" or "Florida MMJ"
2. Write 500-700 words in a conversational, informative tone
3. Start with a brief intro about why staying informed matters
4. Summarize the key news in your own words (never copy verbatim)
5. Include practical takeaways for Florida medical marijuana patients
6. End with a call-to-action mentioning CannaDealsFL.com for deals
7. Use proper HTML formatting: <h2>, <h3>, <p>, <ul>, <li>, <strong>, <blockquote>
8. Include keywords: Florida medical marijuana, cannabis deals, dispensary discounts, MMJ

Output ONLY valid JSON (no markdown code blocks):
{{
  "title": "Post Title Here",
  "excerpt": "1-2 sentence summary for SEO",
  "meta_description": "150-160 character SEO description",
  "content": "Full HTML content here with proper tags",
  "tags": ["tag1", "tag2", "tag3"],
  "focus_keyword": "main SEO keyword"
}}"""

    request_body = {
        "contents": [{"parts": [{"text": prompt}]}],
        "generationConfig": {"temperature": 0.7, "maxOutputTokens": 3000}
    }

    try:
        response = requests.post(GEMINI_URL, json=request_body, timeout=90)
        response.raise_for_status()
        payload = response.json()
        text = payload["candidates"][0]["content"]["parts"][0]["text"]

        # Strip optional markdown fences, then grab the outermost JSON object.
        text = re.sub(r'^```json\s*', '', text)
        text = re.sub(r'\s*```$', '', text)
        match = re.search(r'\{[\s\S]*\}', text)
        if match:
            return json.loads(match.group())
    except Exception as e:
        print(f"Error generating content: {e}")

    return None


def create_slug(title):
    """Create a URL-friendly slug from a post title.

    Lowercases, strips punctuation, collapses whitespace/hyphen runs to a
    single hyphen, and truncates to 80 characters. Unlike the original,
    stray leading/trailing hyphens are removed — these appeared when a
    title started/ended with punctuation, or when the 80-char truncation
    cut mid-word, producing URLs like ".../blog/some-title-".
    """
    slug = re.sub(r'[^\w\s-]', '', title.lower())
    slug = re.sub(r'[-\s]+', '-', slug)
    return slug[:80].strip('-')


def publish_post(post_data, image_url=None):
    """POST the generated article to the blog API.

    Args:
        post_data: dict from generate_blog_post (title, content, etc.).
        image_url: optional public URL for the featured header image.

    Returns:
        bool: True when the API accepts the post, False otherwise.
    """
    if not API_KEY:
        print("Error: ADMIN_API_KEY not set")
        return False

    slug = create_slug(post_data["title"])

    payload = {
        "title": post_data["title"],
        "slug": slug,
        "content": post_data["content"],
        "excerpt": post_data.get("excerpt", post_data.get("meta_description", "")),
        "meta_description": post_data["meta_description"],
        "meta_title": post_data["title"],
        "content_type": "news",
        "tags": post_data.get("tags", ["cannabis news", "florida"]),
        "featured_image_url": image_url,
        "is_published": True,
        "publish_date": datetime.now().isoformat(),
        "seo_score": 85,
    }

    headers = {"X-API-Key": API_KEY, "Content-Type": "application/json"}
    try:
        resp = requests.post(f"{API_BASE}/blog", json=payload,
                             headers=headers, timeout=30)
        resp.raise_for_status()
        resp.json()  # surface malformed-response errors via the except below
        print(f"✅ Published: {post_data['title']}")
        print(f"   URL: https://cannadealsfl.com/blog/{slug}")
        return True
    except Exception as e:
        print(f"❌ Failed to publish: {e}")
        # requests' HTTPError carries the server response; show a snippet.
        if hasattr(e, 'response') and e.response is not None:
            print(f"   Response: {e.response.text[:200]}")
        return False


def main():
    """Run the full pipeline: scan feeds, generate a post + image, publish.

    Exits with status 1 when required credentials are missing. All other
    failures print a message and return so cron logs stay readable.
    """
    print(f"🌿 Starting cannabis news scan at {datetime.now()}")

    # Check required env vars
    if not API_KEY:
        print("Error: Set ADMIN_API_KEY environment variable")
        sys.exit(1)
    if not GEMINI_API_KEY:
        print("Error: Set GOOGLE_GENERATIVE_AI_API_KEY environment variable")
        sys.exit(1)

    # Load processed articles
    processed = load_processed()
    print(f"📚 Loaded {len(processed)} previously processed articles")

    # Get existing posts
    print("🔍 Checking existing blog posts...")
    existing_titles = get_existing_posts()
    print(f"   Found {len(existing_titles)} existing posts")

    # Fetch news
    print("📰 Fetching news...")
    articles = fetch_news()
    print(f"   Found {len(articles)} articles")

    if not articles:
        print("No articles found, exiting")
        return

    # Filter new articles
    print("🔎 Filtering duplicates...")
    new_articles = filter_new_articles(articles, processed, existing_titles)
    print(f"   {len(new_articles)} new articles to process")

    if not new_articles:
        print("No new articles to process, exiting")
        return

    # Take only the top 3 new articles for one post
    articles_to_use = [a for a, h in new_articles[:3]]
    hashes_to_save = [h for a, h in new_articles[:3]]

    # Generate blog post
    print("🤖 Generating blog post with AI...")
    post = generate_blog_post(articles_to_use)

    if not post:
        print("Failed to generate post")
        return

    print(f"   Generated: {post['title']}")

    # Create slug first (needed for image)
    slug = create_slug(post['title'])

    # Generate header image (REQUIRED - no publish without image)
    print("🎨 Generating header image...")
    image_url = generate_header_image(post['title'], slug)

    if not image_url:
        print("❌ FAILED: Image generation failed - NOT publishing without image")
        print("   This prevents broken images on the site")
        return

    # Add cache-busting query param to defeat Cloudflare 404 cache
    cache_buster = int(datetime.now().timestamp())
    image_url_with_cache = f"{image_url}?v={cache_buster}"

    print(f"   ✅ Image: {image_url_with_cache}")

    # Publish
    print("🚀 Publishing...")
    if publish_post(post, image_url_with_cache):
        # Save processed hashes
        processed.extend(hashes_to_save)
        save_processed(processed)

        # Run vision validation. Best-effort: previously an unhandled
        # TimeoutExpired (or a missing validator script) would crash the
        # run AFTER a successful publish, hiding the success in cron logs.
        print("🔍 Running post-publish validation...")
        try:
            validation_result = subprocess.run([
                "/home/crogers2287/.openclaw/agents/cannadeals-marketing/scripts/validate-blog-post.sh",
                f"https://cannadealsfl.com/blog/{slug}"
            ], capture_output=True, text=True, timeout=120)
        except (subprocess.TimeoutExpired, OSError) as e:
            print(f"⚠️  Validation could not run: {e}")
        else:
            print(validation_result.stdout)
            if validation_result.returncode != 0:
                print("⚠️  Validation failed - check the post manually")
                print(validation_result.stderr)

        print("✅ Done!")
    else:
        print("❌ Failed to publish")


# Allow importing this module (e.g. for testing) without side effects;
# the pipeline runs only when executed as a script.
if __name__ == "__main__":
    main()
