<?php

namespace Drupal\ai_interpolator_scraping_bot;

use Drupal\ai_interpolator\Annotation\AiInterpolatorFieldRule;
use Drupal\ai_interpolator\PluginInterfaces\AiInterpolatorFieldRuleInterface;
use Drupal\Core\Entity\ContentEntityInterface;
use Drupal\Core\Field\FieldDefinitionInterface;
use Drupal\Core\Plugin\ContainerFactoryPluginInterface;
use ivan_boring\Readability\Configuration;
use ivan_boring\Readability\Readability;
use Symfony\Component\DependencyInjection\ContainerInterface;

/**
 * Helper class for the similar text crawlers.
 */
class LongCrawlerRule extends AiInterpolatorFieldRule implements AiInterpolatorFieldRuleInterface, ContainerFactoryPluginInterface {

  /**
   * ScrapingBot API Caller.
   */
  protected ScrapingBot $scrapingBot;

  /**
   * Construct a boolean field.
   *
   * @param array $configuration
   *   Inherited configuration.
   * @param string $plugin_id
   *   Inherited plugin id.
   * @param mixed $plugin_definition
   *   Inherited plugin definition.
   * @param \Drupal\ai_interpolator_scraping_bot\ScrapingBot $scrapingBot
   *   The ScrapingBot requester.
   */
  public function __construct(array $configuration, $plugin_id, $plugin_definition, ScrapingBot $scrapingBot) {
    parent::__construct($configuration, $plugin_id, $plugin_definition);
    $this->scrapingBot = $scrapingBot;
  }

  /**
   * {@inheritDoc}
   */
  public static function create(ContainerInterface $container, array $configuration, $plugin_id, $plugin_definition) {
    return new static(
      $configuration,
      $plugin_id,
      $plugin_definition,
      $container->get('ai_interpolator_scraping_bot.api')
    );
  }

  /**
   * {@inheritDoc}
   */
  public $title = 'ScrapingBot Crawler';

  /**
   * {@inheritDoc}
   */
  public function needsPrompt() {
    return FALSE;
  }

  /**
   * {@inheritDoc}
   */
  public function advancedMode() {
    return FALSE;
  }

  /**
   * {@inheritDoc}
   */
  public function placeholderText() {
    return "";
  }

  /**
   * {@inheritDoc}
   */
  public function allowedInputs() {
    return ['link'];
  }

  /**
   * {@inheritDoc}
   */
  public function extraAdvancedFormFields(ContentEntityInterface $entity, FieldDefinitionInterface $fieldDefinition) {
    $form['interpolator_use_chrome'] = [
      '#type' => 'checkbox',
      '#title' => $this->t('Use Chrome'),
      '#description' => $this->t("Use Chrome when scraping"),
      '#default_value' => $fieldDefinition->getConfig($entity->bundle())->getThirdPartySetting('ai_interpolator', 'interpolator_use_chrome', TRUE),
      '#weight' => -15,
    ];

    $form['interpolator_wait_for_network'] = [
      '#type' => 'checkbox',
      '#title' => $this->t('Wait for network'),
      '#description' => $this->t("Check if you want to wait for most ajax requests to finish until returning the Html content. This can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example."),
      '#default_value' => $fieldDefinition->getConfig($entity->bundle())->getThirdPartySetting('ai_interpolator', 'interpolator_wait_for_network', FALSE),
      '#weight' => -14,
    ];

    $form['interpolator_proxy_country'] = [
      '#type' => 'select',
      '#options' => ScrapingBot::$proxyCountries,
      '#title' => $this->t('Proxy Country'),
      '#description' => $this->t("Where should the scraping take place."),
      '#default_value' => $fieldDefinition->getConfig($entity->bundle())->getThirdPartySetting('ai_interpolator', 'interpolator_proxy_country', 'US'),
      '#weight' => -13,
    ];

    $form['interpolator_use_premium_proxy'] = [
      '#type' => 'checkbox',
      '#title' => $this->t('Use Premium Proxy'),
      '#description' => $this->t("Use Premium Proxy when scraping. This is VERY expensive."),
      '#default_value' => $fieldDefinition->getConfig($entity->bundle())->getThirdPartySetting('ai_interpolator', 'interpolator_use_premium_proxy', FALSE),
      '#weight' => -12,
    ];

    $form['interpolator_crawler_mode'] = [
      '#type' => 'select',
      '#options' => [
        'all' => $this->t('Raw Dump'),
        'readibility' => $this->t('Article Segmentation (Readability)'),
        'selector' => $this->t('HTML Selector'),
      ],
      '#attributes' => [
        'name' => 'interpolator_crawler_mode',
      ],
      '#required' => TRUE,
      '#title' => $this->t('Crawler Mode'),
      '#description' => $this->t("Choose the mode to fetch the page. The options are:<ul>
      <li><strong>Raw Dump</strong> - This fetches the whole body.</li>
      <li><strong>Article Segmentation (Readability)</strong> - This uses the Readability segmentation algorithm of trying to figure out the main content.</li>
      <li><strong>HTML Selector</strong> - Use a tag type and optionally class or id to fetch parts. Can also remove tags.</li></ul>"),
      '#default_value' => $fieldDefinition->getConfig($entity->bundle())->getThirdPartySetting('ai_interpolator', 'interpolator_crawler_mode', 'readibility'),
      '#weight' => -10,
    ];

    $form['interpolator_crawler_tag'] = [
      '#type' => 'textfield',
      '#required' => TRUE,
      '#title' => $this->t('Tag to get'),
      '#default_value' => $fieldDefinition->getConfig($entity->bundle())->getThirdPartySetting('ai_interpolator', 'interpolator_crawler_tag', 'body'),
      '#weight' => -10,
      '#states' => [
        'visible' => [
          ':input[name="interpolator_crawler_mode"]' => [
            'value' => 'selector',
          ],
        ],
      ],
    ];

    $form['interpolator_crawler_remove'] = [
      '#type' => 'textarea',
      '#title' => $this->t('Tags to remove'),
      '#description' => $this->t('These are tags that are just garbage and can be removed. One per line.'),
      '#default_value' => $fieldDefinition->getConfig($entity->bundle())->getThirdPartySetting('ai_interpolator', 'interpolator_crawler_remove', "style\nscript\n"),
      '#weight' => -10,
      '#states' => [
        'visible' => [
          ':input[name="interpolator_crawler_mode"]' => [
            'value' => 'selector',
          ],
        ],
      ],
    ];

    return $form;
  }

  /**
   * {@inheritDoc}
   */
  public function generate(ContentEntityInterface $entity, FieldDefinitionInterface $fieldDefinition, array $interpolatorConfig) {
    // Set options.
    $options['useChrome'] = $interpolatorConfig['use_chrome'];
    $options['waitForNetworkRequests'] = $interpolatorConfig['wait_for_network'];
    $options['proxyCountry'] = $interpolatorConfig['proxy_country'];
    $options['premiumProxy'] = $interpolatorConfig['use_premium_proxy'];

    $uris = $entity->get($interpolatorConfig['base_field'])->getValue();
    // Scrape.
    $values = [];
    foreach ($uris as $uri) {
      try {
        $rawHtml = $this->scrapingBot->scrapeRaw($uri['uri'], $options);

        // Return depending on crawler mode.
        switch ($interpolatorConfig['crawler_mode']) {
          case 'all':
            $values[] = $rawHtml;
            break;

          case 'readibility':
            $readability = new Readability(new Configuration());
            $done = $readability->parse($rawHtml);
            $values[] = $done ? $readability->getContent() : 'No scrape';
            break;

          case 'selector':
            $values[] = $this->getPartial($rawHtml, $interpolatorConfig['openai_crawler_tag'], $interpolatorConfig['openai_crawler_remove']);
            break;
        }
      }
      catch (\Exception $e) {
        $values[] = 'No HTML found';
      }
    }

    return $values;
  }

  /**
   * {@inheritDoc}
   */
  public function verifyValue(ContentEntityInterface $entity, $value, FieldDefinitionInterface $fieldDefinition) {
    // Should be a string.
    if (!is_string($value)) {
      return FALSE;
    }
    // Otherwise it is ok.
    return TRUE;
  }

  /**
   * {@inheritDoc}
   */
  public function storeValues(ContentEntityInterface $entity, array $values, FieldDefinitionInterface $fieldDefinition) {
    // Then set the value.
    $entity->set($fieldDefinition->getName(), $values);
  }

  /**
   * Crude simple DOM traverser.
   *
   * @todo Use an actual traversal library.
   *
   * @var string $html
   *   The html.
   * @var string $tag
   *   The tag to get.
   * @var string $remove
   *   The tags to remove.
   *
   * @return string
   *   The cut out html.
   */
  public function getPartial($html, $tag = 'body', $remove = "") {
    // Get the whole HTML.
    $dom = new \DOMDocument();
    $dom->loadHTML($html);
    // Get the new one.
    $mock = new \DOMDocument();

    // Figure out if classes or id is involved.
    $parts = explode('.', $tag);
    $tag = isset($parts[1]) ? $parts[0] : $tag;
    $class = $parts[1] ?? '';
    $parts = explode('#', $tag);
    $tag = isset($parts[1]) ? $parts[0] : $tag;
    $id = $parts[1] ?? '';

    // Remove.
    foreach (explode("\n", $remove) as $tagRemove) {
      $removals = $dom->getElementsByTagName($tagRemove);
      for ($t = 0; $t < $removals->count(); $t++) {
        $dom->removeChild($removals->item($t));
      }
    }

    // Get the rest.
    $tags = $dom->getElementsByTagName($tag);

    for ($t = 0; $t < $tags->count(); $t++) {
      /** @var DOMNode */
      $tag = $tags->item($t);
      if ($class && $tag->getAttribute('class') != $class) {
        continue;
      }
      if ($id && $tag->getAttribute('id') != $id) {
        continue;
      }
      foreach ($tag->childNodes as $child) {
        $mock->appendChild($mock->importNode($child, TRUE));
      }
    }
    return $mock->saveHTML();
  }

}
