<?php
/**
 * core.framework
 *
 * @category  Core
 * @package   Core_Embed
 * @copyright Copyright (c) 2011. Burza d.o.o. (http://web.burza.hr/en/)
 * @license   proprietary
 */

/**
 * @category  Core
 * @package   Core_Embed
 * @copyright Copyright (c) 2011. Burza d.o.o. (http://web.burza.hr/en/)
 * @license   proprietary
 */
class Core_Embed_Parser_Markup extends Core_Embed_Parser_Abstract
{
    /**
     * Lookup strategy: read the node attribute named by the query's
     * 'attribute' key.
     */
    const LOOKUP_ATTRIBUTE = 'attribute';

    /**
     * Lookup strategy: read the concatenated text content of the node.
     */
    const LOOKUP_VALUE     = 'value';

    /**
     * Maps each response property to an ordered list of lookup candidates.
     *
     * Candidates are tried in order and the first one that yields non-empty
     * content wins. Each candidate is an array with the keys:
     *  - 'xpath'     XPath expression; only the first matching node is used
     *  - 'lookup'    one of the LOOKUP_* constants
     *  - 'attribute' attribute name (required for LOOKUP_ATTRIBUTE)
     *  - 'prefix'    optional; when true, a value starting with "/" is made
     *                absolute against the requested URL
     *
     * @var array
     */
    protected $_mapping = array(
        // name => array(<lookup candidates>)
        'url' => array(
            array(
                'xpath'     => '//head/link[@rel="canonical"]',
                'lookup'    => self::LOOKUP_ATTRIBUTE,
                'attribute' => 'href',
                'prefix'    => true,
            ),
        ),
        'type' => array(
            array(
                'xpath'     => '//head/meta[@property="og:type"]',
                'lookup'    => self::LOOKUP_ATTRIBUTE,
                'attribute' => 'content',
            ),
        ),
        'title' => array(
            array(
                'xpath'     => '//head/meta[@property="og:title"]',
                'lookup'    => self::LOOKUP_ATTRIBUTE,
                'attribute' => 'content',
            ),
            array(
                'xpath'     => '//head/title',
                'lookup'    => self::LOOKUP_VALUE,
            ),
            array(
                'xpath'     => '//h1',
                'lookup'    => self::LOOKUP_VALUE,
            ),
        ),
        'description' => array(
            array(
                'xpath'     => '//head/meta[@property="og:description"]',
                'lookup'    => self::LOOKUP_ATTRIBUTE,
                'attribute' => 'content',
            ),
            array(
                'xpath'     => '//head/meta[@name="description"]',
                'lookup'    => self::LOOKUP_ATTRIBUTE,
                'attribute' => 'content',
            ),
            array(
                'xpath'     => '//p',
                'lookup'    => self::LOOKUP_VALUE,
            ),
        ),
        'provider_name' => array(
            array(
                'xpath'     => '//head/meta[@property="og:site_name"]',
                'lookup'    => self::LOOKUP_ATTRIBUTE,
                'attribute' => 'content',
            ),
        ),
        'image' => array(
            array(
                'xpath'     => '//head/meta[@property="og:image"]',
                'lookup'    => self::LOOKUP_ATTRIBUTE,
                'attribute' => 'content',
                'prefix'    => true,
            ),
            array(
                'xpath'     => '//head/link[@rel="image_src"]',
                'lookup'    => self::LOOKUP_ATTRIBUTE,
                'attribute' => 'href',
                'prefix'    => true,
            ),
        ),
        'image_type' => array(
            array(
                'xpath'     => '//head/meta[@property="og:image:type"]',
                'lookup'    => self::LOOKUP_ATTRIBUTE,
                'attribute' => 'content',
            ),
        ),
        'image_width' => array(
            array(
                'xpath'     => '//head/meta[@property="og:image:width"]',
                'lookup'    => self::LOOKUP_ATTRIBUTE,
                'attribute' => 'content',
            ),
        ),
        'image_height' => array(
            array(
                'xpath'     => '//head/meta[@property="og:image:height"]',
                'lookup'    => self::LOOKUP_ATTRIBUTE,
                'attribute' => 'content',
            ),
        ),
        'video' => array(
            array(
                'xpath'     => '//head/meta[@property="og:video"]',
                'lookup'    => self::LOOKUP_ATTRIBUTE,
                'attribute' => 'content',
                'prefix'    => true,
            ),
        ),
        'video_type' => array(
            array(
                'xpath'     => '//head/meta[@property="og:video:type"]',
                'lookup'    => self::LOOKUP_ATTRIBUTE,
                'attribute' => 'content',
            ),
        ),
        'video_width' => array(
            array(
                'xpath'     => '//head/meta[@property="og:video:width"]',
                'lookup'    => self::LOOKUP_ATTRIBUTE,
                'attribute' => 'content',
            ),
        ),
        'video_height' => array(
            array(
                'xpath'     => '//head/meta[@property="og:video:height"]',
                'lookup'    => self::LOOKUP_ATTRIBUTE,
                'attribute' => 'content',
            ),
        ),
    );

    /**
     * Fetches the URL and fills a response from the markup it returns.
     *
     * The response starts out as a plain link (type, title and url set from
     * the requested URL). When the body can be turned into a DOM, every
     * property in $_mapping that yields non-empty content overrides or
     * extends those defaults.
     *
     * @param string $url
     *
     * @return Core_Embed_Response
     */
    protected function _parse($url)
    {
        $httpClient   = $this->getHttpClient();
        $httpClient->setUri($url);
        $httpResponse = $httpClient->request(Zend_Http_Client::GET);
        $body         = $httpResponse->getBody();
        $type         = $httpResponse->getHeader('Content-Type');
        $response     = new Core_Embed_Response(array(
            'type'  => Core_Embed_Response::TYPE_LINK,
            'title' => $url,
            'url'   => $url,
        ));

        if (empty($body)) {
            // empty body, nothing to inspect
            return $response;
        }

        // Read the detected values explicitly instead of extract()-ing them,
        // so a header-derived result can never overwrite unrelated locals.
        // NOTE(review): assumes _detectType() returns an array with 'type'
        // and 'charset' keys (it is defined outside this file) - confirm.
        $detected = $this->_detectType($type);
        $charset  = null;
        if (is_array($detected)) {
            if (array_key_exists('type', $detected)) {
                $type = $detected['type'];
            }
            if (array_key_exists('charset', $detected)) {
                $charset = $detected['charset'];
            }
        }

        $dom = $this->_generateDom($body, $type, $charset);
        if (!$dom) {
            // unable to create DOM, returning the default response
            return $response;
        }

        // one XPath instance and one parsed base URL serve every lookup
        $xpath = new DOMXPath($dom);
        $base  = parse_url($url);

        foreach ($this->_mapping as $property => $queries) {
            foreach ($queries as $query) {
                $nodes = $xpath->query($query['xpath']);
                if (!$nodes) {
                    // query() yields false for a malformed expression
                    continue;
                }

                $node = $nodes->item(0);
                if (null === $node) {
                    continue;
                }

                $content = null;
                switch ($query['lookup']) {
                    case self::LOOKUP_ATTRIBUTE:
                        $content = $node->getAttribute($query['attribute']);
                        break;
                    case self::LOOKUP_VALUE:
                        $content = $this->_extractText($node);
                        break;
                }

                if (null === $content || '' === $content) {
                    // nothing usable here, try the next candidate
                    continue;
                }

                if (!empty($query['prefix'])) {
                    $content = $this->_resolveUrl($content, $base);
                }
                $response->set($property, $content);

                // found our property, short-circuit the search
                break;
            }
        }

        return $response;
    }

    /**
     * Builds a DOM document from a response body.
     *
     * The body is converted to UTF-8 when another charset is given, repaired
     * with tidy and then loaded as XML ('text/xml') or HTML ('text/html').
     * Any other type yields an empty document.
     *
     * @param string $body    Response body
     * @param string $type    Response MIME type
     * @param string $charset Response charset; empty means "do not convert"
     *
     * @return DOMDocument|false False when the body could not be converted,
     *                           repaired or loaded
     */
    protected function _generateDom($body, $type, $charset)
    {
        // charset names are case-insensitive; an empty one cannot be converted
        $normalized = strtolower(trim((string) $charset));
        if ('' !== $normalized && !in_array($normalized, array('utf-8', 'utf8'), true)) {
            $converted = iconv($charset, 'utf-8', $body);
            if (false === $converted) {
                return false;
            }
            $body = $converted;
        }

        // tidy uses its own encoding names ('utf8'), DOM/mbstring use 'UTF-8'
        $dom = new DOMDocument('1.0', 'UTF-8');
        switch ($type) {
            case 'text/xml':
                $body = tidy_repair_string($body, array(
                    'wrap'      => 0,
                    'input-xml' => true,
                ), 'utf8');
                if (!is_string($body) || '' === $body) {
                    return false;
                }
                // loadXML() returns false on failure
                if (!$dom->loadXML($body, LIBXML_NOERROR)) {
                    return false;
                }
                break;
            case 'text/html':
                $body = tidy_repair_string($body, array(
                    'wrap' => 0,
                ), 'utf8');
                if (!is_string($body) || '' === $body) {
                    return false;
                }
                // Non-ASCII characters become entities so loadHTML() does not
                // have to guess the encoding.
                // NOTE(review): the 'HTML-ENTITIES' target is deprecated as
                // of PHP 8.2 - revisit when upgrading.
                $body   = mb_convert_encoding($body, 'HTML-ENTITIES', 'UTF-8');
                $errors = libxml_use_internal_errors(true);
                $loaded = $dom->loadHTML($body);
                if (!$errors) {
                    // buffering was switched on only for this call, so drop
                    // what it collected instead of letting it pile up
                    libxml_clear_errors();
                }
                libxml_use_internal_errors($errors);
                if (!$loaded) {
                    return false;
                }
                break;
        }
        return $dom;
    }

    /**
     * Returns the concatenated text of a node and all of its descendants.
     *
     * Only DOMText nodes (including subclasses) contribute; other leaf nodes
     * add nothing.
     *
     * @param DOMNode $node
     *
     * @return string
     */
    protected function _extractText(DOMNode $node)
    {
        if ($node instanceof DOMText) {
            return $node->nodeValue;
        }

        // any other node: gather the text of its children, in document order
        $text = '';
        for ($child = $node->firstChild; null !== $child; $child = $child->nextSibling) {
            $text .= $this->_extractText($child);
        }
        return $text;
    }

    /**
     * Makes a root-relative or protocol-relative reference absolute.
     *
     * Values that do not start with "/" are returned untouched, as are all
     * values when the base URL has no scheme or host.
     *
     * @param string      $content Reference found in the markup
     * @param array|false $base    parse_url() result for the requested URL
     *
     * @return string
     */
    private function _resolveUrl($content, $base)
    {
        if (!is_array($base) || empty($base['scheme']) || empty($base['host'])) {
            return $content;
        }
        if (0 !== strpos($content, '/')) {
            return $content;
        }
        if (0 === strpos($content, '//')) {
            // protocol-relative: it already names a host, only add the scheme
            return $base['scheme'] . ':' . $content;
        }

        $authority = $base['host'];
        if (isset($base['port'])) {
            // keep a non-default port, otherwise the link targets another origin
            $authority .= ':' . $base['port'];
        }
        return sprintf('%1$s://%2$s%3$s', $base['scheme'], $authority, $content);
    }
}
