<?php
/**
 * Content extractor for document sync.
 *
 * Extracts searchable content from WordPress posts, WooCommerce products,
 * ACF fields, and page builder content.
 *
 * @package Intufind
 */

// Prevent direct access.
if ( ! defined( 'ABSPATH' ) ) {
	exit;
}

/**
 * Unified content extractor.
 *
 * Responsible for extracting all relevant content from WordPress entities
 * for synchronization to the cloud search index.
 */
class Intufind_Content_Extractor {

	/**
	 * Maximum content length (200KB to support chunking in cloud).
	 *
	 * @var int
	 */
	const MAX_CONTENT_LENGTH = 200000;

	/**
	 * Build the sync document for a single post or product.
	 *
	 * Products are routed to the WooCommerce-specific extractor, but only when
	 * the WooCommerce class is loaded; everything else goes through the
	 * generic post extractor.
	 *
	 * @param int    $post_id   Post ID.
	 * @param string $post_type Post type.
	 * @return array|null Document data or null if invalid.
	 */
	public function extract( $post_id, $post_type ) {
		$use_product_extractor = ( 'product' === $post_type ) && class_exists( 'WooCommerce' );

		if ( ! $use_product_extractor ) {
			return $this->extract_post( $post_id, $post_type );
		}

		return $this->extract_product( $post_id );
	}

	/**
	 * Extract content from a standard WordPress post.
	 *
	 * The document body is assembled from the cleaned post content, ACF field
	 * text and page builder text, joined by blank lines. The body is capped at
	 * MAX_CONTENT_LENGTH bytes without splitting a multibyte character.
	 *
	 * @param int    $post_id   Post ID.
	 * @param string $post_type Post type.
	 * @return array|null Document data, or null when the post does not exist
	 *                    or has neither a title nor any extracted content.
	 */
	private function extract_post( $post_id, $post_type ) {
		$post = get_post( $post_id );
		if ( ! $post ) {
			return null;
		}

		// Gather text from every source; empty pieces are dropped before joining.
		$content_parts = array(
			$this->clean_content( $post->post_content ),
			$this->extract_acf_content( $post_id ),
			$this->extract_page_builder_content( $post_id, $post ),
		);
		$full_content  = implode( "\n\n", array_filter( $content_parts ) );

		// Validate we have meaningful content.
		if ( empty( $post->post_title ) && empty( $full_content ) ) {
			return null;
		}

		// Truncate if necessary. The limit is measured in bytes, so prefer
		// mb_strcut(), which cuts on a byte budget but never in the middle of a
		// UTF-8 character; plain substr() could leave a broken trailing sequence.
		if ( strlen( $full_content ) > self::MAX_CONTENT_LENGTH ) {
			$full_content = function_exists( 'mb_strcut' )
				? mb_strcut( $full_content, 0, self::MAX_CONTENT_LENGTH, 'UTF-8' )
				: substr( $full_content, 0, self::MAX_CONTENT_LENGTH );
		}

		// Get taxonomies.
		$categories = $this->get_post_terms( $post_id, 'category' );
		$tags       = $this->get_post_terms( $post_id, 'post_tag' );

		// Get featured image.
		$image_id  = get_post_thumbnail_id( $post_id );
		$image_url = $image_id ? wp_get_attachment_url( $image_id ) : null;

		return array(
			'id'            => (string) $post_id,
			'title'         => $post->post_title,
			'content'       => $full_content,
			'excerpt'       => $this->clean_content( $post->post_excerpt ),
			'url'           => get_permalink( $post_id ),
			'postType'      => $post_type,
			'status'        => $post->post_status,
			'author'        => get_the_author_meta( 'display_name', $post->post_author ),
			'created'       => $post->post_date_gmt,
			'modified'      => $post->post_modified_gmt,
			'categories'    => $categories,
			'tags'          => $tags,
			'featuredImage' => $image_url,
			'searchable'    => Intufind_List_Columns::is_searchable( $post_id ),
			'source'        => 'wordpress',
		);
	}

	/**
	 * Extract content from a WooCommerce product.
	 *
	 * The document body is built from the cleaned product description plus
	 * ACF field text, capped at MAX_CONTENT_LENGTH bytes without splitting a
	 * multibyte character. Pricing, inventory, taxonomy, attribute, variation
	 * and media data are added alongside it.
	 *
	 * @param int $product_id Product ID.
	 * @return array|null Document data or null if the product cannot be loaded.
	 */
	private function extract_product( $product_id ) {
		$product = wc_get_product( $product_id );
		if ( ! $product ) {
			return null;
		}

		// Gather text from every source; empty pieces are dropped before joining.
		$content_parts = array(
			$this->clean_content( $product->get_description() ),
			$this->extract_acf_content( $product_id ),
		);
		$full_content  = implode( "\n\n", array_filter( $content_parts ) );

		// Truncate if necessary. The limit is measured in bytes, so prefer
		// mb_strcut(), which cuts on a byte budget but never in the middle of a
		// UTF-8 character; plain substr() could leave a broken trailing sequence.
		if ( strlen( $full_content ) > self::MAX_CONTENT_LENGTH ) {
			$full_content = function_exists( 'mb_strcut' )
				? mb_strcut( $full_content, 0, self::MAX_CONTENT_LENGTH, 'UTF-8' )
				: substr( $full_content, 0, self::MAX_CONTENT_LENGTH );
		}

		// Get categories.
		$categories = array();
		foreach ( $product->get_category_ids() as $cat_id ) {
			$term = get_term( $cat_id, 'product_cat' );
			if ( $term && ! is_wp_error( $term ) ) {
				$categories[] = $term->name;
			}
		}

		// Get tags.
		$tags = array();
		foreach ( $product->get_tag_ids() as $tag_id ) {
			$term = get_term( $tag_id, 'product_tag' );
			if ( $term && ! is_wp_error( $term ) ) {
				$tags[] = $term->name;
			}
		}

		// Get brands (if taxonomy exists).
		$brands = array();
		if ( taxonomy_exists( 'product_brand' ) ) {
			$brand_terms = wp_get_post_terms( $product_id, 'product_brand' );
			if ( ! is_wp_error( $brand_terms ) ) {
				foreach ( $brand_terms as $term ) {
					$brands[] = $term->name;
				}
			}
		}

		// Get product attributes.
		$attributes = $this->extract_product_attributes( $product );

		// Get variations summary for variable products.
		$variations = array();
		if ( $product->is_type( 'variable' ) ) {
			$variations = $this->extract_variation_summary( $product );
		}

		// Get image URL.
		$image_id  = $product->get_image_id();
		$image_url = $image_id ? wp_get_attachment_url( $image_id ) : null;

		// Get gallery images, skipping attachments that no longer resolve to a URL.
		$gallery_urls = array();
		foreach ( $product->get_gallery_image_ids() as $gallery_id ) {
			$url = wp_get_attachment_url( $gallery_id );
			if ( $url ) {
				$gallery_urls[] = $url;
			}
		}

		return array(
			'id'              => (string) $product_id,
			'name'            => $product->get_name(),
			'content'         => $full_content,
			'excerpt'         => $this->clean_content( $product->get_short_description() ),
			'url'             => $product->get_permalink(),
			'postType'        => 'product',
			'status'          => $product->get_status(),
			// NOTE(review): posts send the *_gmt columns; confirm which timezone
			// these formatted product dates are expressed in.
			'created'         => $product->get_date_created() ? $product->get_date_created()->date( 'Y-m-d H:i:s' ) : null,
			'modified'        => $product->get_date_modified() ? $product->get_date_modified()->date( 'Y-m-d H:i:s' ) : null,

			// Pricing. The sale price goes through parse_numeric_field() so an
			// unset price becomes null while a genuine '0' sale price stays 0.0.
			'price'           => (float) $product->get_price(),
			'regularPrice'    => (float) $product->get_regular_price(),
			'salePrice'       => $this->parse_numeric_field( $product->get_sale_price() ),
			'onSale'          => $product->is_on_sale(),

			// Inventory.
			'sku'             => $product->get_sku(),
			'stockStatus'     => $product->get_stock_status(),
			'stockQuantity'   => $product->get_stock_quantity(),
			'manageStock'     => $product->get_manage_stock(),

			// Product details (cast to float, empty/null values become null, '0' is valid).
			'weight'          => $this->parse_numeric_field( $product->get_weight() ),
			'dimensions'      => array(
				'length' => $this->parse_numeric_field( $product->get_length() ),
				'width'  => $this->parse_numeric_field( $product->get_width() ),
				'height' => $this->parse_numeric_field( $product->get_height() ),
			),
			'productType'     => $product->get_type(),
			'featured'        => $product->is_featured(),
			'virtual'         => $product->is_virtual(),
			'downloadable'    => $product->is_downloadable(),

			// Taxonomy.
			'categories'      => $categories,
			'tags'            => $tags,
			'brands'          => $brands,

			// Attributes & variations.
			'attributes'      => $attributes,
			'variations'      => $variations,

			// Media.
			'image'           => $image_url,
			'gallery'         => $gallery_urls,

			// Visibility: a product is searchable only when it is flagged as
			// such and its catalog visibility is not 'hidden'.
			'catalogVisibility' => $product->get_catalog_visibility(),
			'searchable'      => Intufind_List_Columns::is_searchable( $product_id ) && 'hidden' !== $product->get_catalog_visibility(),

			// Source tracking.
			'source'          => 'wordpress',
		);
	}

	/**
	 * Collect searchable text from all ACF fields attached to a post.
	 *
	 * Returns an empty string when ACF's get_fields() is unavailable or the
	 * post has no field values.
	 *
	 * @param int $post_id Post ID.
	 * @return string Text of every field, joined by single spaces.
	 */
	private function extract_acf_content( $post_id ) {
		// Without ACF loaded there is nothing to read.
		$fields = function_exists( 'get_fields' ) ? get_fields( $post_id ) : array();
		if ( ! is_array( $fields ) || empty( $fields ) ) {
			return '';
		}

		$pieces = array();
		foreach ( $fields as $name => $value ) {
			$pieces[] = $this->extract_acf_field_value( $name, $value, $post_id );
		}

		// array_filter() discards fields that produced no text.
		return implode( ' ', array_filter( $pieces ) );
	}

	/**
	 * Extract searchable text from an ACF field value.
	 *
	 * The field type is looked up with get_field_object(); when it cannot be
	 * resolved the value is treated as plain text. Text-like handlers only
	 * accept scalar values, so an array or object that reaches them (for
	 * example because the type lookup failed) yields an empty string instead
	 * of the literal "Array" or a string-conversion error.
	 *
	 * @param string $field_name  Field name.
	 * @param mixed  $field_value Field value.
	 * @param int    $post_id     Post ID.
	 * @return string Extracted text, or an empty string when nothing usable is found.
	 */
	private function extract_acf_field_value( $field_name, $field_value, $post_id ) {
		if ( empty( $field_value ) ) {
			return '';
		}

		// Get field type if available.
		$field_object = function_exists( 'get_field_object' ) ? get_field_object( $field_name, $post_id ) : null;
		$field_type   = $field_object['type'] ?? 'text';

		// Joins the scalar leaves of a (possibly nested) choice array. Choice
		// values may be flat strings or nested arrays, and imploding the nested
		// form directly would emit "Array".
		$join_scalars = static function ( array $values ) use ( &$join_scalars ) {
			$leaves = array();
			foreach ( $values as $value ) {
				if ( is_array( $value ) ) {
					$nested = $join_scalars( $value );
					if ( '' !== $nested ) {
						$leaves[] = $nested;
					}
				} elseif ( is_scalar( $value ) ) {
					$leaves[] = (string) $value;
				}
			}
			return implode( ' ', $leaves );
		};

		switch ( $field_type ) {
			case 'text':
			case 'email':
			case 'url':
			case 'textarea':
			case 'wysiwyg':
				// Non-scalars cannot be cast to text safely.
				return is_scalar( $field_value ) ? $this->clean_content( (string) $field_value ) : '';

			case 'select':
			case 'radio':
			case 'button_group':
				if ( is_array( $field_value ) ) {
					return $join_scalars( $field_value );
				}
				return is_scalar( $field_value ) ? (string) $field_value : '';

			case 'checkbox':
				return is_array( $field_value ) ? $join_scalars( $field_value ) : '';

			case 'repeater':
			case 'flexible_content':
				return $this->extract_acf_repeater( $field_value );

			case 'group':
				if ( is_array( $field_value ) ) {
					$parts = array();
					foreach ( $field_value as $sub_name => $sub_value ) {
						// NOTE(review): the sub-field is looked up by its bare name;
						// confirm get_field_object() can resolve it. If it cannot, the
						// sub-value is handled as plain text.
						$parts[] = $this->extract_acf_field_value( $sub_name, $sub_value, $post_id );
					}
					return implode( ' ', array_filter( $parts ) );
				}
				return '';

			case 'relationship':
			case 'post_object':
				return $this->extract_acf_relationship( $field_value );

			case 'taxonomy':
				return $this->extract_acf_taxonomy( $field_value );

			case 'image':
				// Extract alt text if available.
				if ( is_array( $field_value ) && ! empty( $field_value['alt'] ) ) {
					return $field_value['alt'];
				}
				return '';

			default:
				// Try to extract string value.
				if ( is_string( $field_value ) || is_numeric( $field_value ) ) {
					return $this->clean_content( (string) $field_value );
				}
				return '';
		}
	}

	/**
	 * Extract text from ACF repeater/flexible content.
	 *
	 * Only string and numeric sub-values directly inside each row are used;
	 * nested arrays and objects are skipped. The 'acf_fc_layout' entry that
	 * flexible content rows carry is ignored, because it names the layout
	 * rather than holding editorial content.
	 *
	 * @param mixed $field_value Repeater value (expected: array of row arrays).
	 * @return string Cleaned sub-values joined by single spaces.
	 */
	private function extract_acf_repeater( $field_value ) {
		if ( ! is_array( $field_value ) ) {
			return '';
		}

		$parts = array();
		foreach ( $field_value as $row ) {
			if ( ! is_array( $row ) ) {
				continue;
			}

			foreach ( $row as $sub_key => $sub_value ) {
				// Layout identifier of a flexible content row, not content.
				if ( 'acf_fc_layout' === $sub_key ) {
					continue;
				}

				if ( ! is_string( $sub_value ) && ! is_numeric( $sub_value ) ) {
					continue;
				}

				$cleaned = $this->clean_content( (string) $sub_value );
				if ( ! empty( $cleaned ) ) {
					$parts[] = $cleaned;
				}
			}
		}

		return implode( ' ', $parts );
	}

	/**
	 * Turn an ACF relationship / post object value into a string of post titles.
	 *
	 * Accepts a single item or an array of items. Numeric items are loaded with
	 * get_post(); objects exposing a post_title property are used directly.
	 * Anything else is ignored.
	 *
	 * @param mixed $field_value Relationship value.
	 * @return string Post titles joined by single spaces.
	 */
	private function extract_acf_relationship( $field_value ) {
		$items  = is_array( $field_value ) ? $field_value : array( $field_value );
		$titles = array();

		foreach ( $items as $item ) {
			if ( is_numeric( $item ) ) {
				$related = get_post( $item );
				if ( $related ) {
					$titles[] = $related->post_title;
				}
				continue;
			}

			if ( is_object( $item ) && isset( $item->post_title ) ) {
				$titles[] = $item->post_title;
			}
		}

		return implode( ' ', $titles );
	}

	/**
	 * Turn an ACF taxonomy field value into a string of term names.
	 *
	 * Accepts a single item or an array of items. Numeric items are loaded with
	 * get_term(); objects exposing a name property are used directly. Anything
	 * else, including failed term lookups, is ignored.
	 *
	 * @param mixed $field_value Taxonomy value.
	 * @return string Term names joined by single spaces.
	 */
	private function extract_acf_taxonomy( $field_value ) {
		$items = is_array( $field_value ) ? $field_value : array( $field_value );
		$names = array();

		foreach ( $items as $item ) {
			if ( is_numeric( $item ) ) {
				$term = get_term( $item );
				if ( $term && ! is_wp_error( $term ) ) {
					$names[] = $term->name;
				}
				continue;
			}

			if ( is_object( $item ) && isset( $item->name ) ) {
				$names[] = $item->name;
			}
		}

		return implode( ' ', $names );
	}

	/**
	 * Extract text stored by a page builder.
	 *
	 * Builders are probed in a fixed order (Elementor, then Beaver Builder,
	 * then Divi) and only the first match is used.
	 *
	 * @param int      $post_id Post ID.
	 * @param \WP_Post $post    Post object.
	 * @return string Extracted content, or an empty string when no builder matches.
	 */
	private function extract_page_builder_content( $post_id, $post ) {
		$content = '';

		if ( $this->is_elementor_post( $post_id ) ) {
			$content = $this->extract_elementor_content( $post_id );
		} elseif ( $this->is_beaver_builder_post( $post_id ) ) {
			$content = $this->extract_beaver_builder_content( $post_id );
		} elseif ( $this->is_divi_post( $post ) ) {
			$content = $this->extract_divi_content( $post );
		}

		return $content;
	}

	/**
	 * Tell whether a post was built with Elementor.
	 *
	 * True only when the Elementor plugin class is loaded and the post's
	 * '_elementor_edit_mode' meta equals 'builder'.
	 *
	 * @param int $post_id Post ID.
	 * @return bool
	 */
	private function is_elementor_post( $post_id ) {
		if ( ! class_exists( '\Elementor\Plugin' ) ) {
			return false;
		}

		return 'builder' === get_post_meta( $post_id, '_elementor_edit_mode', true );
	}

	/**
	 * Read a post's Elementor data and extract its text.
	 *
	 * The '_elementor_data' meta is JSON-decoded when it is a string and used
	 * as-is otherwise. Missing, empty or undecodable data yields an empty string.
	 *
	 * @param int $post_id Post ID.
	 * @return string Extracted content.
	 */
	private function extract_elementor_content( $post_id ) {
		if ( ! class_exists( '\Elementor\Plugin' ) ) {
			return '';
		}

		$raw = get_post_meta( $post_id, '_elementor_data', true );
		if ( empty( $raw ) ) {
			return '';
		}

		// Stored data may be a JSON string or an already-decoded array.
		$tree = $raw;
		if ( is_string( $raw ) ) {
			$tree = json_decode( $raw, true );
		}

		return is_array( $tree ) ? $this->extract_elementor_text_recursive( $tree ) : '';
	}

	/**
	 * Walk an Elementor element tree and collect its text settings.
	 *
	 * For each element, string settings whose key is one of the known text
	 * keys are cleaned and collected, followed by the text of its child
	 * elements.
	 *
	 * @param array $elements Elementor elements.
	 * @return string Collected text joined by single spaces.
	 */
	private function extract_elementor_text_recursive( $elements ) {
		$text_keys = array( 'title', 'editor', 'text', 'description', 'content', 'heading' );
		$collected = array();

		foreach ( $elements as $element ) {
			// Text held directly in this element's settings.
			if ( ! empty( $element['settings'] ) ) {
				foreach ( $element['settings'] as $setting_key => $setting_value ) {
					if ( ! is_string( $setting_value ) || ! in_array( $setting_key, $text_keys, true ) ) {
						continue;
					}

					$clean = $this->clean_content( $setting_value );
					if ( ! empty( $clean ) ) {
						$collected[] = $clean;
					}
				}
			}

			// Text held by nested elements.
			if ( empty( $element['elements'] ) ) {
				continue;
			}

			$nested = $this->extract_elementor_text_recursive( $element['elements'] );
			if ( ! empty( $nested ) ) {
				$collected[] = $nested;
			}
		}

		return implode( ' ', $collected );
	}

	/**
	 * Tell whether a post was built with Beaver Builder.
	 *
	 * True only when the FLBuilderModel class is loaded and the post's
	 * '_fl_builder_enabled' meta is truthy.
	 *
	 * @param int $post_id Post ID.
	 * @return bool
	 */
	private function is_beaver_builder_post( $post_id ) {
		if ( ! class_exists( 'FLBuilderModel' ) ) {
			return false;
		}

		return (bool) get_post_meta( $post_id, '_fl_builder_enabled', true );
	}

	/**
	 * Extract text from a post's Beaver Builder layout data.
	 *
	 * Reads the '_fl_builder_data' meta and, for every node with settings,
	 * collects the cleaned string settings whose key is one of the known
	 * text keys.
	 *
	 * @param int $post_id Post ID.
	 * @return string Collected text joined by single spaces.
	 */
	private function extract_beaver_builder_content( $post_id ) {
		$nodes = get_post_meta( $post_id, '_fl_builder_data', true );
		if ( ! is_array( $nodes ) || empty( $nodes ) ) {
			return '';
		}

		$text_keys = array( 'text', 'heading', 'content', 'title', 'html' );
		$collected = array();

		foreach ( $nodes as $node ) {
			if ( empty( $node->settings ) ) {
				continue;
			}

			// Settings are read as a property and cast so they can be iterated as key/value pairs.
			foreach ( (array) $node->settings as $setting_key => $setting_value ) {
				if ( ! is_string( $setting_value ) || ! in_array( $setting_key, $text_keys, true ) ) {
					continue;
				}

				$clean = $this->clean_content( $setting_value );
				if ( ! empty( $clean ) ) {
					$collected[] = $clean;
				}
			}
		}

		return implode( ' ', $collected );
	}

	/**
	 * Tell whether a post contains Divi builder shortcodes.
	 *
	 * Divi counts as present when ET_BUILDER_VERSION is defined or
	 * et_divi_fonts_url() exists; the post must also contain '[et_pb_'.
	 *
	 * @param \WP_Post $post Post object.
	 * @return bool
	 */
	private function is_divi_post( $post ) {
		$divi_present = defined( 'ET_BUILDER_VERSION' ) || function_exists( 'et_divi_fonts_url' );
		if ( ! $divi_present ) {
			return false;
		}

		return false !== strpos( $post->post_content, '[et_pb_' );
	}

	/**
	 * Extract text from Divi shortcode markup.
	 *
	 * Opening and closing et_pb_* shortcode tags are removed (opening tags
	 * first, then closing tags) while the text between them is kept; the
	 * remainder is then cleaned.
	 *
	 * @param \WP_Post $post Post object.
	 * @return string Extracted content.
	 */
	private function extract_divi_content( $post ) {
		// Patterns are applied in order: opening tags, then closing tags.
		$tag_patterns = array(
			'/\[et_pb_[^\]]*\]/',
			'/\[\/et_pb_[^\]]*\]/',
		);

		$without_tags = preg_replace( $tag_patterns, '', $post->post_content );

		return $this->clean_content( $without_tags );
	}

	/**
	 * Collect the visible attributes of a product.
	 *
	 * Taxonomy attributes are keyed by their label and hold term names; custom
	 * attributes are keyed by their name and hold their raw options.
	 * Attributes that are hidden, malformed or have no values are left out.
	 *
	 * @param \WC_Product $product Product object.
	 * @return array|null Attributes with values, or null if none.
	 */
	private function extract_product_attributes( $product ) {
		$collected = array();

		foreach ( $product->get_attributes() as $attribute ) {
			$is_usable = is_object( $attribute )
				&& method_exists( $attribute, 'get_visible' )
				&& $attribute->get_visible();
			if ( ! $is_usable ) {
				continue;
			}

			$name = $attribute->get_name();

			// Custom attribute: options are stored on the attribute itself.
			if ( ! $attribute->is_taxonomy() ) {
				$options = $attribute->get_options();
				if ( ! empty( $options ) ) {
					$collected[ $name ] = $options;
				}
				continue;
			}

			// Taxonomy-based attribute: use the names of its terms.
			$terms = $attribute->get_terms();
			if ( empty( $terms ) || is_wp_error( $terms ) ) {
				continue;
			}

			$term_names = array();
			foreach ( $terms as $term ) {
				$term_names[] = $term->name;
			}
			$collected[ wc_attribute_label( $name ) ] = $term_names;
		}

		// Return null for empty to avoid JSON encoding as [] (array) instead of {} (object).
		return empty( $collected ) ? null : $collected;
	}

	/**
	 * Extract variation summary for variable products.
	 *
	 * The summary always contains 'priceRange' (min/max variation price as
	 * floats). 'availableOptions' is added only when the product has
	 * variation attributes, keyed by attribute label. Empty option values are
	 * removed, but a zero value such as '0' is kept.
	 *
	 * @param \WC_Product_Variable $product Variable product.
	 * @return array Variation summary.
	 */
	private function extract_variation_summary( $product ) {
		$summary = array(
			'priceRange' => array(
				'min' => (float) $product->get_variation_price( 'min' ),
				'max' => (float) $product->get_variation_price( 'max' ),
			),
		);

		// Drops '', null, false and empty arrays while keeping numeric zero,
		// which array_filter() without a callback would discard.
		$is_real_option = static function ( $value ) {
			return ! empty( $value ) || is_numeric( $value );
		};

		foreach ( $product->get_variation_attributes() as $attribute => $values ) {
			$label = wc_attribute_label( $attribute );

			$summary['availableOptions'][ $label ] = array_values( array_filter( $values, $is_real_option ) );
		}

		return $summary;
	}

	/**
	 * List the names of a post's terms in one taxonomy.
	 *
	 * @param int    $post_id  Post ID.
	 * @param string $taxonomy Taxonomy name.
	 * @return array Term names; empty when the lookup fails or finds no terms.
	 */
	private function get_post_terms( $post_id, $taxonomy ) {
		$terms = wp_get_post_terms( $post_id, $taxonomy );
		if ( empty( $terms ) || is_wp_error( $terms ) ) {
			return array();
		}

		// Keep the keys of the original term list.
		$names = array();
		foreach ( $terms as $index => $term ) {
			$names[ $index ] = $term->name;
		}

		return $names;
	}

	/**
	 * Convert a raw field value to a float, or null when it holds no number.
	 *
	 * Null, the empty string and any non-numeric value give null. Numeric
	 * values, including '0' and '0.0', are cast to float, so a zero is kept
	 * as 0.0 rather than being treated as "unset".
	 *
	 * @param mixed $value The value to parse (string, null, or numeric).
	 * @return float|null The parsed float value, or null if empty/invalid.
	 */
	private function parse_numeric_field( $value ) {
		$is_unset = ( null === $value ) || ( '' === $value );

		if ( $is_unset || ! is_numeric( $value ) ) {
			return null;
		}

		return (float) $value;
	}

	/**
	 * Clean content by stripping HTML, shortcodes, and extra whitespace.
	 *
	 * Steps, in order: strip shortcodes, strip HTML tags, decode HTML
	 * entities, collapse every run of whitespace to one space, trim.
	 * Whitespace is matched in UTF-8 mode so that non-breaking spaces
	 * (for example from a decoded "&nbsp;") are collapsed as well and
	 * multibyte characters are never split.
	 *
	 * @param string $content Raw content.
	 * @return string Cleaned content; empty string for empty input.
	 */
	private function clean_content( $content ) {
		if ( empty( $content ) ) {
			return '';
		}

		// Strip shortcodes.
		$content = strip_shortcodes( $content );

		// Strip HTML tags.
		$content = wp_strip_all_tags( $content );

		// Decode HTML entities.
		$content = html_entity_decode( $content, ENT_QUOTES | ENT_HTML5, 'UTF-8' );

		// Normalize whitespace in UTF-8 mode first.
		$normalized = preg_replace( '/\s+/u', ' ', $content );

		// The "u" modifier makes preg_replace() return null on invalid UTF-8;
		// fall back to byte-wise matching rather than losing the content.
		if ( null === $normalized ) {
			$normalized = preg_replace( '/\s+/', ' ', $content );
		}

		return trim( (string) $normalized );
	}

	/**
	 * Generate a content hash for change detection.
	 *
	 * The hash covers the document's title (or product name), content,
	 * excerpt, status and modified date. For products it also covers price,
	 * sale price, stock status, on-sale flag, categories and attributes.
	 *
	 * @param int    $post_id   Post ID.
	 * @param string $post_type Post type.
	 * @return string MD5 content hash, or an empty string when no document
	 *                could be extracted.
	 */
	public function generate_content_hash( $post_id, $post_type ) {
		$document = $this->extract( $post_id, $post_type );
		if ( ! $document ) {
			return '';
		}

		// Include key fields that would require re-sync. Product documents
		// carry their title under 'name', so fall back to it; otherwise a
		// product's name would never take part in the hash.
		$hash_data = array(
			'title'    => $document['title'] ?? $document['name'] ?? '',
			'content'  => $document['content'] ?? '',
			'excerpt'  => $document['excerpt'] ?? '',
			'status'   => $document['status'] ?? '',
			'modified' => $document['modified'] ?? '',
		);

		// For products, include pricing and inventory.
		if ( 'product' === $post_type ) {
			$hash_data['price']       = $document['price'] ?? 0;
			$hash_data['salePrice']   = $document['salePrice'] ?? null;
			$hash_data['stockStatus'] = $document['stockStatus'] ?? '';
			$hash_data['onSale']      = $document['onSale'] ?? false;
			$hash_data['categories']  = $document['categories'] ?? array();
			$hash_data['attributes']  = $document['attributes'] ?? array();
		}

		return md5( wp_json_encode( $hash_data ) );
	}
}
