#!/usr/bin/env python3
"""
Reddit Search: Uses self-hosted SearXNG metasearch engine
Prioritizes recent and high-score posts.
Usage: python search_reddit.py "<query>" [--subreddit <name>] [--limit <N>]
"""

import sys
import json
import re
from datetime import datetime
from urllib.parse import quote
import requests

SEARXNG_URL = "http://127.0.0.1:8080/search"

def query_searxng(query, subreddit=None, limit=20):
    """Search Reddit posts via a self-hosted SearXNG instance.

    Args:
        query: Free-text search terms.
        subreddit: Optional subreddit name (with or without the 'r/'
            prefix); appended to the query as a ``subreddit:`` filter.
        limit: Maximum number of Reddit results to return.

    Returns:
        A list of normalized result dicts (title/url/subreddit/snippet/...),
        or an empty list if the HTTP request or JSON parsing fails.
    """
    # Build search query with a site filter so only Reddit pages come back.
    search_query = f"site:reddit.com {query}"
    if subreddit:
        # Strip only a *leading* 'r/'; str.replace would also mangle
        # subreddit names containing 'r/' elsewhere in the string.
        sub = subreddit[2:] if subreddit.startswith('r/') else subreddit
        search_query += f" subreddit:{sub}"

    # SearXNG API parameters
    params = {
        'q': search_query,
        'format': 'json',
        'language': 'en',
        'engines': 'google,brave,startpage'  # Use multiple engines for better coverage
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    try:
        # Keep the try block narrow: only the network round-trip and the
        # JSON decode can reasonably fail; a broad except here would also
        # swallow programming errors in the result-processing loop below.
        response = requests.get(SEARXNG_URL, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers malformed JSON (json.JSONDecodeError subclasses it).
        print(f"SearXNG search error: {e}", file=sys.stderr)
        return []

    raw_results = data.get('results', [])

    # Filter only Reddit URLs and normalize each hit into our result schema.
    results = []
    for r in raw_results:
        url = r.get('url', '')
        if 'reddit.com' not in url:
            continue

        # Extract the subreddit name from the URL path.
        sub_match = re.search(r'reddit\.com/r/([^/]+)', url)
        subreddit_name = f"r/{sub_match.group(1)}" if sub_match else "Unknown"

        results.append({
            'title': r.get('title', ''),
            'url': url,
            'subreddit': subreddit_name,
            'source': r.get('engine', 'searxng'),
            'snippet': r.get('content', '')[:300],
            'author': 'N/A',   # SearXNG does not expose Reddit post metadata
            'score': r.get('score', 0),
            'comments': 0,     # not available from metasearch results
            'created': r.get('publishedDate', 'N/A'),
            'created_ts': None,
            'engines': r.get('engines', [])
        })

        if len(results) >= limit:
            break

    return results

def score_results(results, query_terms):
    """Rank results in place by a heuristic relevance score.

    The score combines SearXNG's own aggregated score (scaled), exact
    query-term hits in the title (highest weight), term hits in the
    snippet, and a bonus for results corroborated by several engines.

    Returns the same list sorted by descending ``relevance_score``.
    """
    terms = query_terms.lower().split()

    for item in results:
        title = item['title'].lower()
        snippet = item['snippet'].lower()

        # Base: SearXNG's aggregated score, scaled up.
        total = item.get('score', 0) * 5
        # Exact term hits in the title carry the most weight.
        total += sum(15 for t in terms if t in title)
        # Snippet hits count too, at a lower weight.
        total += sum(5 for t in terms if t in snippet)
        # More confirming engines = more reliable result.
        total += 3 * len(item.get('engines', []))

        item['relevance_score'] = int(total)

    # Best matches first.
    return sorted(results, key=lambda r: r['relevance_score'], reverse=True)

def format_results(results, limit=20):
    """Render the scored results as a human-readable report string."""
    shown = min(limit, len(results))
    lines = [
        f"🔍 Found {len(results)} results (top {shown})",
        "⚡ Source: SearXNG metasearch (Google, Brave, Startpage)",
        "=" * 70,
    ]

    for idx, item in enumerate(results[:limit], start=1):
        # Prefer a per-engine tag; fall back to the single source name.
        engines = item.get('engines', [])
        if engines:
            tag = f"[SEARXNG: {'+'.join(engines[:2])}]"
        else:
            tag = f"[{item['source'].upper()}]"

        lines.append(f"\n{idx}. {tag} {item['title']}")
        lines.append(f"   📍 {item['subreddit']}")
        lines.append(f"   🔗 {item['url']}")

        if item.get('snippet') and item['snippet'] != 'N/A':
            lines.append(f"   💬 {item['snippet'][:150]}...")

        # Show published date only when one was reported.
        if item.get('created') and item['created'] != 'N/A':
            lines.append(f"   📅 {item['created']}")

    return '\n'.join(lines)

def main():
    """CLI entry point: parse arguments, search, score, and print results."""
    args = sys.argv[1:]

    if not args:
        print("Usage: python search_reddit.py '<query>' [--subreddit <name>] [--limit <N>]")
        print("Note: Uses self-hosted SearXNG metasearch engine.")
        sys.exit(1)

    query = args[0]
    subreddit = None
    limit = 20

    # Minimal hand-rolled flag parsing; unknown tokens are silently skipped.
    i = 1
    while i < len(args):
        if args[i] == '--subreddit' and i + 1 < len(args):
            subreddit = args[i + 1]
            i += 2
        elif args[i] == '--limit' and i + 1 < len(args):
            try:
                limit = int(args[i + 1])
            except ValueError:
                # Fail cleanly instead of dumping a traceback on bad input.
                print(f"Invalid --limit value: {args[i + 1]!r} (expected an integer)",
                      file=sys.stderr)
                sys.exit(1)
            i += 2
        else:
            i += 1

    # Search SearXNG
    print(f"🔍 Searching Reddit for: '{query}'...")
    if subreddit:
        print(f"   Subreddit: {subreddit}")
    print(f"   Top results: {limit}")

    results = query_searxng(query, subreddit, limit=limit)

    # Rank by relevance when anything came back.
    scored_results = score_results(results, query) if results else []

    # Human-readable report.
    print("\n" + format_results(scored_results, limit))

    # Machine-readable copy of the ranked results.
    print("\n" + "=" * 70)
    print("JSON Output (top results):")
    print(json.dumps(scored_results[:limit], indent=2, ensure_ascii=False))
