#!/usr/bin/env python3
"""
Daily Cannabis News Scraper for CannaDealsFL Blog
Fetches cannabis news, generates blog posts via AI with header images, publishes to site.
Prevents duplicates by checking existing post titles.
FIXED: Images are now generated and uploaded BEFORE publishing.
"""

import os
import sys
import json
import re
import hashlib
import requests
import base64
from datetime import datetime, timedelta
from pathlib import Path
import feedparser

# API Configuration
API_BASE = "https://www.cannadealsfl.com/api/v1"
API_KEY = os.environ.get("ADMIN_API_KEY", "")  # required; main() exits if unset

# News Sources
# Google News RSS queries plus the MJBizDaily feed; each is polled every run.
NEWS_SOURCES = [
    "https://news.google.com/rss/search?q=florida+medical+cannabis+marijuana&hl=en-US&gl=US&ceid=US:en",
    "https://news.google.com/rss/search?q=florida+dispensary+news+deals&hl=en-US&gl=US&ceid=US:en",
    "https://feeds.mjbizdaily.com/mjbizdaily",
]

# Gemini API for content generation
GEMINI_API_KEY = os.environ.get("GOOGLE_GENERATIVE_AI_API_KEY", "")  # required; main() exits if unset
GEMINI_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={GEMINI_API_KEY}"

# Image output directory (on production server)
# NOTE(review): absolute paths assume this script runs on the production host — confirm.
IMAGE_OUTPUT_DIR = "/home/crogers2287/cannadealsfl/public/images/blog/headers"
IMAGE_BASE_URL = "https://cannadealsfl.com/images/blog/headers"

# Track processed articles to prevent duplicates.
# JSON list of 16-char article hashes; trimmed to the newest 100 in save_processed().
PROCESSED_FILE = "/home/crogers2287/cannadealsfl/logs/.news_processed.json"


def load_processed():
    """Return the list of previously processed article hashes.

    Reads PROCESSED_FILE (a JSON list written by save_processed). Returns
    an empty list when the file is missing, unreadable, or corrupted so a
    fresh run can proceed without state.
    """
    try:
        with open(PROCESSED_FILE, 'r') as f:
            return json.load(f)
    except (OSError, ValueError):
        # OSError: file missing/unreadable. ValueError covers
        # json.JSONDecodeError for a corrupted state file.
        # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
        return []


def save_processed(processed):
    """Persist processed-article hashes to PROCESSED_FILE as JSON."""
    os.makedirs(os.path.dirname(PROCESSED_FILE), exist_ok=True)
    # Cap the state file at the 100 most recent hashes.
    recent = processed[-100:]
    with open(PROCESSED_FILE, 'w') as f:
        json.dump(recent, f)


def article_hash(title, link):
    """Return a short, stable 16-char fingerprint for an article."""
    fingerprint = f"{title}|{link}".encode()
    return hashlib.md5(fingerprint).hexdigest()[:16]


def get_existing_posts():
    """Fetch lowercased titles of published posts for duplicate detection.

    Returns an empty list on any error so the pipeline can continue
    (at the risk of re-posting something already published).
    """
    url = f"{API_BASE}/blog?limit=50&is_published=true"
    try:
        resp = requests.get(url, headers={"X-API-Key": API_KEY}, timeout=30)
        resp.raise_for_status()
        payload = resp.json()
        return [post["title"].lower() for post in payload.get("data", [])]
    except Exception as e:
        print(f"Warning: Could not fetch existing posts: {e}")
        return []


def fetch_news():
    """Pull recent entries from every RSS feed in NEWS_SOURCES.

    Returns a list of dicts with title/link/summary/published keys.
    A failure on one feed is logged and does not abort the others.
    """
    collected = []
    for url in NEWS_SOURCES:
        try:
            feed = feedparser.parse(url)
            # Cap at 10 entries per feed to bound downstream work.
            for entry in feed.entries[:10]:
                collected.append({
                    "title": entry.title,
                    "link": entry.link,
                    "summary": getattr(entry, "summary", ""),
                    "published": getattr(entry, "published", datetime.now().isoformat()),
                })
        except Exception as e:
            print(f"Error fetching {url}: {e}")
    return collected


def filter_new_articles(articles, processed, existing_titles):
    """Return (article, hash) pairs for articles not yet processed or published.

    An article is dropped when its hash appears in `processed`, or when the
    first 50 chars of its lowercased title overlap (either-direction
    substring match) with any existing post title.
    """
    new_articles = []
    for article in articles:
        h = article_hash(article["title"], article["link"])
        if h in processed:
            continue
        prefix = article["title"].lower()[:50]
        # Guard `if existing`: an empty title in existing_titles would
        # satisfy `existing in prefix` for EVERY article ("" is a substring
        # of everything), wrongly flagging the whole batch as duplicates.
        if any(prefix in existing or existing in prefix
               for existing in existing_titles if existing):
            print(f"  Skipping duplicate: {article['title'][:50]}...")
            continue
        new_articles.append((article, h))
    return new_articles


def generate_header_image(title, slug):
    """Generate header image via Gemini Imagen and save to disk.

    Args:
        title: Post title, interpolated into the image-generation prompt.
        slug: Post slug; sanitized into the output PNG filename.

    Returns:
        Public URL string for the generated, pre-existing, or fallback
        image, or None when generation fails and no fallback file exists.
    """
    # Ensure output directory exists
    os.makedirs(IMAGE_OUTPUT_DIR, exist_ok=True)
    
    # Clean filename: keep only lowercase alphanumerics/hyphens, max 50 chars
    safe_slug = re.sub(r'[^a-z0-9-]', '', slug)[:50]
    image_filename = f"{safe_slug}.png"
    image_path = os.path.join(IMAGE_OUTPUT_DIR, image_filename)
    
    # Check if image already exists — reuse it and skip the API call
    if os.path.exists(image_path):
        print(f"  Image already exists: {image_filename}")
        return f"{IMAGE_BASE_URL}/{image_filename}"
    
    prompt = f"""Generate a professional, editorial-style header image for a cannabis industry news blog post.

Topic: {title}

Style requirements:
- Clean, modern design suitable for a news article
- Professional editorial aesthetic
- No text or words in the image
- Subtle gradient background (greens, blues, or warm tones)
- Abstract or stylized imagery, not photorealistic
- No visible cannabis products (compliance safe)
- 1200x630 aspect ratio for social sharing

Create a professional blog header image."""

    try:
        # Try Gemini 2.0 Flash with image generation
        response = requests.post(
            f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash-exp-image-generation:generateContent?key={GEMINI_API_KEY}",
            json={
                "contents": [{"parts": [{"text": prompt}]}],
                "generationConfig": {
                    # Model requires both modalities; only the image part is consumed below
                    "responseModalities": ["image", "text"],
                    "temperature": 0.8
                }
            },
            timeout=120
        )
        
        if response.status_code == 200:
            result = response.json()
            # Scan the first candidate's parts for an inline (base64) image payload
            for part in result.get("candidates", [{}])[0].get("content", {}).get("parts", []):
                if "inlineData" in part:
                    # Decode and save image
                    image_data = base64.b64decode(part["inlineData"]["data"])
                    with open(image_path, 'wb') as f:
                        f.write(image_data)
                    print(f"  ✅ Image saved: {image_filename}")
                    return f"{IMAGE_BASE_URL}/{image_filename}"
        
        # Non-200 or no inlineData part found: fall through to fallbacks below
        print(f"  ⚠️ Image generation returned {response.status_code}, using fallback")
    except Exception as e:
        print(f"  ⚠️ Image generation error: {e}")
    
    # Fallback: use an existing header image already on disk
    fallbacks = [
        "florida-dispensary-hero.png",
        "01-florida-dispensary-deals.png",
    ]
    for fb in fallbacks:
        fb_path = os.path.join(IMAGE_OUTPUT_DIR, fb)
        if os.path.exists(fb_path):
            print(f"  Using fallback: {fb}")
            return f"{IMAGE_BASE_URL}/{fb}"
    
    # Last resort: no image — caller publishes with featured_image_url=None
    return None


def generate_blog_post(articles):
    """Ask Gemini to write a blog post from up to three news articles.

    Returns the parsed JSON dict (title, excerpt, meta_description,
    content, tags, focus_keyword) or None on any failure.
    """
    if not articles:
        return None

    articles_text = "\n\n".join(
        f"Title: {a['title']}\nSummary: {a['summary']}\nLink: {a['link']}"
        for a in articles[:3]
    )
    today = datetime.now().strftime("%B %d, %Y")

    prompt = f"""You are a cannabis industry blogger for CannaDealsFL, a Florida medical cannabis deal aggregator website.

Create an engaging, SEO-optimized blog post about recent cannabis news relevant to Florida MMJ patients.

Date: {today}

News sources to reference:
{articles_text}

Requirements:
1. Create a catchy title that includes "Florida Cannabis" or "Florida MMJ"
2. Write 500-700 words in a conversational, informative tone
3. Start with a brief intro about why staying informed matters
4. Summarize the key news in your own words (never copy verbatim)
5. Include practical takeaways for Florida medical marijuana patients
6. End with a call-to-action mentioning CannaDealsFL.com for deals
7. Use proper HTML formatting: <h2>, <h3>, <p>, <ul>, <li>, <strong>, <blockquote>
8. Include keywords: Florida medical marijuana, cannabis deals, dispensary discounts, MMJ

Output ONLY valid JSON (no markdown code blocks):
{{
  "title": "Post Title Here",
  "excerpt": "1-2 sentence summary for SEO",
  "meta_description": "150-160 character SEO description",
  "content": "Full HTML content here with proper tags",
  "tags": ["tag1", "tag2", "tag3"],
  "focus_keyword": "main SEO keyword"
}}"""

    try:
        payload = {
            "contents": [{"parts": [{"text": prompt}]}],
            "generationConfig": {"temperature": 0.7, "maxOutputTokens": 3000}
        }
        resp = requests.post(GEMINI_URL, json=payload, timeout=90)
        resp.raise_for_status()

        raw = resp.json()["candidates"][0]["content"]["parts"][0]["text"]

        # The model sometimes wraps output in markdown fences despite the
        # instructions; strip them before extracting the JSON object.
        raw = re.sub(r'^```json\s*', '', raw)
        raw = re.sub(r'\s*```$', '', raw)
        match = re.search(r'\{[\s\S]*\}', raw)
        if match:
            return json.loads(match.group())
    except Exception as e:
        print(f"Error generating content: {e}")

    return None


def create_slug(title):
    """Build a URL-safe slug from a post title.

    Lowercases, drops punctuation other than hyphens, collapses runs of
    whitespace/hyphens into single hyphens, truncates to 80 characters,
    and trims leading/trailing hyphens.
    """
    slug = re.sub(r'[^\w\s-]', '', title.lower())
    slug = re.sub(r'[-\s]+', '-', slug)
    # strip('-') fixes slugs like "breaking-news-" left behind by trailing
    # punctuation, or by the 80-char truncation landing on a hyphen.
    return slug[:80].strip('-')


def publish_post(post_data, image_url=None):
    """POST the generated article to the blog API as a published post.

    Returns True on success, False on any failure (missing API key,
    HTTP error, or a non-JSON response body).
    """
    if not API_KEY:
        print("Error: ADMIN_API_KEY not set")
        return False

    slug = create_slug(post_data["title"])

    payload = {
        "title": post_data["title"],
        "slug": slug,
        "content": post_data["content"],
        "excerpt": post_data.get("excerpt", post_data.get("meta_description", "")),
        "meta_description": post_data["meta_description"],
        "meta_title": post_data["title"],
        "content_type": "news",
        "tags": post_data.get("tags", ["cannabis news", "florida"]),
        "featured_image_url": image_url,
        "is_published": True,
        "publish_date": datetime.now().isoformat(),
        "seo_score": 85,
    }

    try:
        resp = requests.post(
            f"{API_BASE}/blog",
            json=payload,
            headers={"X-API-Key": API_KEY, "Content-Type": "application/json"},
            timeout=30
        )
        resp.raise_for_status()
        resp.json()  # surface an invalid-JSON response body as a failure
        print(f"✅ Published: {post_data['title']}")
        print(f"   URL: https://cannadealsfl.com/blog/{slug}")
        return True
    except Exception as e:
        print(f"❌ Failed to publish: {e}")
        # requests.HTTPError carries the server response; show a snippet.
        if hasattr(e, 'response') and e.response is not None:
            print(f"   Response: {e.response.text[:200]}")
        return False


def main():
    """Run the pipeline: fetch news, dedupe, generate post + image, publish."""
    print(f"🌿 Starting cannabis news scan at {datetime.now()}")
    
    # Fail fast on missing credentials before doing any network work.
    if not API_KEY:
        print("Error: Set ADMIN_API_KEY environment variable")
        sys.exit(1)
    if not GEMINI_API_KEY:
        print("Error: Set GOOGLE_GENERATIVE_AI_API_KEY environment variable")
        sys.exit(1)
    
    processed = load_processed()
    print(f"📚 Loaded {len(processed)} previously processed articles")
    
    print("🔍 Checking existing blog posts...")
    existing_titles = get_existing_posts()
    print(f"   Found {len(existing_titles)} existing posts")
    
    print("📰 Fetching news...")
    articles = fetch_news()
    print(f"   Found {len(articles)} articles")
    
    if not articles:
        print("No articles found, exiting")
        return
    
    print("🔎 Filtering duplicates...")
    new_articles = filter_new_articles(articles, processed, existing_titles)
    print(f"   {len(new_articles)} new articles to process")
    
    if not new_articles:
        print("No new articles to process, exiting")
        return
    
    # Only the first 3 new articles feed this post; their hashes are saved
    # on success, so the remainder get picked up by later runs.
    articles_to_use = [a for a, h in new_articles[:3]]
    hashes_to_save = [h for a, h in new_articles[:3]]
    
    print("🤖 Generating blog post with AI...")
    post = generate_blog_post(articles_to_use)
    
    if not post:
        print("Failed to generate post")
        return
    
    print(f"   Generated: {post['title']}")
    
    # IMPORTANT: Generate image BEFORE publishing
    # so the post's featured_image_url resolves the moment it goes live.
    slug = create_slug(post['title'])
    print("🎨 Generating header image...")
    image_url = generate_header_image(post['title'], slug)
    if image_url:
        print(f"   ✅ Image: {image_url}")
    else:
        print("   ⚠️ No image available")
    
    # Now publish with image already in place
    print("🚀 Publishing...")
    if publish_post(post, image_url):
        # Record hashes only after a successful publish so failures retry next run.
        processed.extend(hashes_to_save)
        save_processed(processed)
        print("✅ Done!")
    else:
        print("❌ Failed to publish")


# Script entry point (intended to be run as a daily cron job).
if __name__ == "__main__":
    main()
