<?php

namespace Intucart\Utils;

use League\HTMLToMarkdown\HtmlConverter;

/**
 * StringUtils class
 *
 * Provides reusable string manipulation and cleaning utilities
 * for the IntUCart plugin codebase.
 */
class StringUtils
{
    /**
     * Clean text by stripping HTML tags and decoding entities.
     *
     * Intended for user-generated content such as taxonomy terms, product
     * names and descriptions. Line breaks are removed along with the tags.
     *
     * @param string|null $text Text to clean
     * @return string Cleaned text ('' for null or empty input)
     */
    public static function cleanText(?string $text): string
    {
        // Explicit checks instead of empty(): empty('0') is true and would erase a literal "0".
        if ($text === null || $text === '') {
            return '';
        }

        return self::stripAndDecode($text, true);
    }

    /**
     * Clean and normalize text for search/comparison purposes.
     *
     * Builds on cleanText() and additionally:
     * - converts to lowercase (multibyte-aware when mbstring is available)
     * - collapses runs of whitespace into a single space
     * - optionally removes every character other than a-z, 0-9, space, '-' and '_'
     *   (note: this also removes non-ASCII letters such as accented characters)
     *
     * @param string|null $text Text to normalize
     * @param bool $removeSpecialChars Whether to remove special characters
     * @return string Normalized text
     */
    public static function normalizeText(?string $text, bool $removeSpecialChars = false): string
    {
        if ($text === null || $text === '') {
            return '';
        }

        $normalized = self::lowercase(self::cleanText($text));
        $normalized = self::collapseWhitespace($normalized);

        if ($removeSpecialChars) {
            // Byte-wise on purpose: every byte of a multibyte character is outside
            // the allowed set, so whole characters are removed and the call cannot
            // fail on invalid UTF-8. Whitespace is already reduced to plain spaces.
            $stripped = preg_replace('/[^a-z0-9 \-_]/', '', $normalized);
            if ($stripped !== null) {
                $normalized = $stripped;
            }

            // Removing a character between two spaces ("a & b") leaves a double space.
            $respaced = preg_replace('/ {2,}/', ' ', $normalized);
            if ($respaced !== null) {
                $normalized = $respaced;
            }
        }

        return trim($normalized);
    }

    /**
     * Truncate text to a specified length and append a suffix.
     *
     * The text is cleaned with cleanText() first. Length is measured in
     * characters (UTF-8), not bytes, so multibyte characters are never split.
     * The suffix is appended after cutting, so a truncated result can be up to
     * $length plus the suffix length long.
     *
     * @param string|null $text Text to truncate
     * @param int $length Maximum number of characters kept from the text (default: 100)
     * @param string $suffix Suffix to append when truncated (default: '...')
     * @param bool $preserveWords Whether to cut at the last space inside the limit
     * @return string Truncated text, or the cleaned text unchanged when it fits
     */
    public static function truncate(?string $text, int $length = 100, string $suffix = '...', bool $preserveWords = true): string
    {
        if ($text === null || $text === '') {
            return '';
        }

        return self::truncatePlain(self::cleanText($text), $length, $suffix, $preserveWords);
    }

    /**
     * Generate a safe slug from text.
     *
     * Useful for creating URL-safe slugs from product names, category names, etc.
     * The text is normalized with special characters removed, spaces become the
     * separator, repeated separators are collapsed and leading/trailing
     * separators are dropped.
     *
     * @param string|null $text Text to convert to slug
     * @param string $separator Separator string (default: '-'); '' simply removes spaces
     * @return string Safe slug
     */
    public static function createSlug(?string $text, string $separator = '-'): string
    {
        if ($text === null || $text === '') {
            return '';
        }

        $slug = self::normalizeText($text, true);

        if ($separator === '') {
            // Nothing to collapse or trim; an empty separator would also yield an invalid regex.
            return str_replace(' ', '', $slug);
        }

        $slug = str_replace(' ', $separator, $slug);

        // Quote with the delimiter so a "/" separator cannot break the pattern, and
        // group the separator so multi-character separators are matched as a whole.
        $quoted = preg_quote($separator, '/');

        $collapsed = preg_replace('/(?:' . $quoted . '){2,}/', $separator, $slug);
        if ($collapsed !== null) {
            $slug = $collapsed;
        }

        $trimmed = preg_replace('/^(?:' . $quoted . ')+|(?:' . $quoted . ')+$/', '', $slug);

        return $trimmed !== null ? $trimmed : $slug;
    }

    /**
     * Extract plain text from HTML content.
     *
     * More aggressive than cleanText(): script/style elements are removed with
     * their content, line-break and block-closing tags become word boundaries,
     * and all whitespace is collapsed to single spaces.
     *
     * @param string|null $html HTML content
     * @param int|null $maxLength Maximum length in characters (null for no limit);
     *                            longer text is cut at a word boundary and '...' is appended
     * @return string Plain text content
     */
    public static function extractPlainText(?string $html, ?int $maxLength = null): string
    {
        if ($html === null || $html === '') {
            return '';
        }

        // Remove script and style elements completely (keep the input if the regex fails).
        $withoutCode = preg_replace('/<(script|style)[^>]*>.*?<\/\1>/is', '', $html);
        if ($withoutCode !== null) {
            $html = $withoutCode;
        }

        // Turn line breaks and closing block tags into whitespace so that the words
        // on either side are not glued together once the tags are stripped.
        $withBreaks = preg_replace(
            '/<br\b[^>]*>|<\/(?:p|div|li|h[1-6]|tr|td|th|blockquote)\s*>/i',
            "\n",
            $html
        );
        if ($withBreaks !== null) {
            $html = $withBreaks;
        }

        $text = trim(self::collapseWhitespace(self::stripAndDecode($html, true)));

        if ($maxLength !== null) {
            // The text is already plain: truncate without cleaning it a second time,
            // which would decode entities twice and strip literal "<...>" text.
            $text = self::truncatePlain($text, $maxLength, '...', true);
        }

        return $text;
    }

    /**
     * Convert HTML content to Markdown while stripping unsafe/irrelevant markup.
     * Falls back to plain-text extraction if conversion fails or yields fewer
     * than 5 bytes of Markdown.
     *
     * @param string|null $html HTML content
     * @param bool $removeImages Whether to drop <img> tags entirely (default true)
     * @return string Markdown string
     */
    public static function htmlToMarkdown(?string $html, bool $removeImages = true): string
    {
        if ($html === null || $html === '') {
            return '';
        }

        // Keep the untouched input so the error fallback never works on a half-processed value.
        $source = $html;

        try {
            // Remove script/style entirely before conversion
            $prepared = preg_replace('/<(script|style)[^>]*>.*?<\/\1>/is', '', $html);
            if ($prepared === null) {
                $prepared = $html;
            }

            if ($removeImages) {
                // Remove images to avoid alt text noise from galleries/icons
                $withoutImages = preg_replace('/<img[^>]*>/i', '', $prepared);
                if ($withoutImages !== null) {
                    $prepared = $withoutImages;
                }
            }

            $converter = new HtmlConverter([
                'strip_tags' => true,
                // The converter expects a space-separated list of tag names here.
                'remove_nodes' => 'style script noscript iframe svg canvas',
                'hard_break' => true,
            ]);

            $markdown = (string) $converter->convert($prepared);

            // Limit consecutive blank lines and trim
            $compact = preg_replace("/\n{3,}/", "\n\n", $markdown);
            if ($compact !== null) {
                $markdown = $compact;
            }
            $markdown = trim($markdown);

            // If conversion results in almost nothing, fallback to plain text
            if (strlen($markdown) < 5) {
                return self::extractPlainText($prepared);
            }

            return $markdown;
        } catch (\Throwable $e) {
            // Deliberate best-effort: any conversion failure degrades to plain text.
            return self::extractPlainText($source);
        }
    }

    /**
     * Recursively clean attribute values: strip tags, decode entities, normalize whitespace.
     *
     * Arrays stay arrays (keys preserved); entries that clean to '', [] or null
     * are dropped. Objects are cast to arrays and cleaned the same way. Strings
     * holding serialized PHP data are returned untouched; other strings are
     * reduced to plain text. All remaining values are returned unchanged.
     *
     * @param mixed $value
     * @return mixed
     */
    public static function cleanAttributeValue($value)
    {
        if (is_object($value)) {
            // Convert to array then clean
            $value = (array) $value;
        }

        if (is_array($value)) {
            $result = [];
            foreach ($value as $key => $item) {
                $clean = self::cleanAttributeValue($item);
                // Skip empty strings/arrays after cleaning
                if ($clean === '' || $clean === [] || $clean === null) {
                    continue;
                }
                $result[$key] = $clean;
            }
            return $result;
        }

        if (is_string($value)) {
            // Serialized PHP (e.g., a:1:{...}) is kept raw to avoid breaking its meaning.
            if (self::looksSerialized($value)) {
                return $value;
            }

            // Many attributes contain HTML snippets (icons, spans) → strip to plain
            return self::extractPlainText($value);
        }

        // Pass through scalars
        return $value;
    }

    /**
     * Check if text contains potentially sensitive information.
     *
     * Simple heuristic: matches SSN-, card-, email- and phone-shaped patterns
     * and a short list of keywords. False positives are expected (any
     * 10-digit number matches the phone pattern, for example).
     *
     * @param string|null $text Text to check
     * @return bool True if text might contain sensitive info
     */
    public static function containsSensitiveInfo(?string $text): bool
    {
        if ($text === null || $text === '') {
            return false;
        }

        // Lowercased once so the patterns and keywords below only need lowercase forms.
        $text = strtolower($text);

        // Patterns that might indicate sensitive information
        $sensitivePatterns = [
            '/\b\d{3}-\d{2}-\d{4}\b/',                         // SSN pattern
            '/\b\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b/',       // Credit card pattern
            '/\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}\b/',     // Email pattern
            '/\b\d{3}[- ]?\d{3}[- ]?\d{4}\b/',                 // Phone number pattern
        ];

        foreach ($sensitivePatterns as $pattern) {
            if (preg_match($pattern, $text) === 1) {
                return true;
            }
        }

        // Check for common sensitive keywords
        $sensitiveKeywords = [
            'password', 'credit card', 'ssn', 'social security',
            'bank account', 'routing number', 'api key', 'secret',
        ];

        foreach ($sensitiveKeywords as $keyword) {
            if (strpos($text, $keyword) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Sanitize text for safe database storage.
     *
     * Strips tags and decodes entities, then applies the WordPress sanitizer
     * for the requested type. Unknown types are treated as 'text'.
     *
     * @param string|null $text Text to sanitize
     * @param string $type Type of sanitization ('text', 'textarea', 'email', 'url')
     * @return string Sanitized text
     */
    public static function sanitizeForDatabase(?string $text, string $type = 'text'): string
    {
        if ($text === null || $text === '') {
            return '';
        }

        switch ($type) {
            case 'textarea':
                // Keep line breaks while stripping tags; removing them first would
                // leave sanitize_textarea_field() nothing to preserve.
                return sanitize_textarea_field(self::stripAndDecode($text, false));
            case 'email':
                return sanitize_email(self::cleanText($text));
            case 'url':
                return esc_url_raw(self::cleanText($text));
            case 'text':
            default:
                return sanitize_text_field(self::cleanText($text));
        }
    }

    /**
     * Strip tags, decode HTML entities (like &amp; to &, &quot; to ") and trim.
     *
     * @param string $text Raw text
     * @param bool $removeBreaks Passed to wp_strip_all_tags() as its second argument
     * @return string Stripped and decoded text
     */
    private static function stripAndDecode(string $text, bool $removeBreaks): string
    {
        $stripped = wp_strip_all_tags($text, $removeBreaks);

        return trim(html_entity_decode($stripped, ENT_QUOTES, 'UTF-8'));
    }

    /**
     * Lowercase text, multibyte-aware when the mbstring extension is available.
     *
     * @param string $text Text to lowercase
     * @return string Lowercased text
     */
    private static function lowercase(string $text): string
    {
        return function_exists('mb_strtolower')
            ? mb_strtolower($text, 'UTF-8')
            : strtolower($text);
    }

    /**
     * Replace every run of whitespace with a single space.
     *
     * Uses a UTF-8 aware pattern so Unicode whitespace (such as the non-breaking
     * space produced by decoding &nbsp;) is covered and multibyte characters are
     * never matched byte-by-byte.
     *
     * @param string $text Text to process
     * @return string Text with collapsed whitespace
     */
    private static function collapseWhitespace(string $text): string
    {
        $collapsed = preg_replace('/\s+/u', ' ', $text);

        if ($collapsed === null) {
            // The /u pattern fails on invalid UTF-8; retry byte-wise with an
            // explicit ASCII-only whitespace list.
            $collapsed = preg_replace('/[ \t\n\r\f\x0B]+/', ' ', $text);
        }

        return $collapsed !== null ? $collapsed : $text;
    }

    /**
     * Truncate already-clean text without re-running any HTML cleaning.
     *
     * @param string $text Plain text
     * @param int $length Maximum number of characters to keep (negative values count as 0)
     * @param string $suffix Suffix appended when the text is cut
     * @param bool $preserveWords Whether to cut at the last space inside the limit
     * @return string Truncated text, or $text unchanged when it fits
     */
    private static function truncatePlain(string $text, int $length, string $suffix, bool $preserveWords): string
    {
        $length = max(0, $length);

        if (mb_strlen($text, 'UTF-8') <= $length) {
            return $text;
        }

        $head = mb_substr($text, 0, $length, 'UTF-8');

        if ($preserveWords) {
            // Byte functions are safe here: an ASCII space never occurs inside a
            // multibyte UTF-8 sequence, so the cut lands on a character boundary.
            $lastSpace = strrpos($head, ' ');
            if ($lastSpace !== false) {
                $head = substr($head, 0, $lastSpace);
            }
        }

        return $head . $suffix;
    }

    /**
     * Tell whether a string holds serialized PHP data.
     *
     * Delegates to WordPress' is_serialized() when it is loaded; otherwise a
     * strict pattern over the complete string is used, so ordinary text that
     * merely starts with a letter and a colon ("A: large") is not mistaken for
     * serialized data.
     *
     * @param string $value String to inspect
     * @return bool True when the whole string looks like serialized PHP
     */
    private static function looksSerialized(string $value): bool
    {
        if (function_exists('is_serialized')) {
            return (bool) is_serialized($value);
        }

        $pattern = '/^(?:N;|b:[01];|[id]:[0-9.eE+\-]+;|s:\d+:".*";|a:\d+:\{.*\}|O:\d+:"[^"]+":\d+:\{.*\})$/s';

        return preg_match($pattern, trim($value)) === 1;
    }
}
