max
max

Reputation: 3716

extracting and printing an html element by its id using DOMDocument

i want to extract a couple of tables from a web page and show them in my page

i was going to use regex to extract them, but then i saw the DOMDocument class and it seems cleaner. i've looked on stackoverflow and it seems all the questions are about getting inner text or using a loop to get the inner nodes of elements. i want to know how i can extract and print an html element by its id.

// Fetch the page markup as a string.
$html = file_get_contents("www.site.com");
// Parse the markup into a DOM tree.
$xml = new DOMDocument();
$xml->loadHTML($html);
// Select every element, of any tag name, whose id attribute equals 'myid'.
$xpath = new DOMXPath($xml);
$table =$xpath->query("//*[@id='myid']");
$table->saveHTML(); // this obviously doesn't work: query() returns a DOMNodeList, and saveHTML() is a DOMDocument method

how can i show or echo the $table as an actual html table on my page ?

Upvotes: 2

Views: 3643

Answers (2)

nathan
nathan

Reputation: 5452

You can use DOMElement::C14N() to get the canonicalized HTML(XML) representation of a DOMElement, or if you like a bit more control so that you can filter certain elements and attributes you can use something like this:

/**
 * Serialise a list of DOM nodes to an HTML string, filtering tags and attributes.
 *
 * Elements whose (lower-cased) name is listed in $tagsToStrip are dropped together
 * with their whole subtree. Text nodes are emitted HTML-escaped. Other non-element
 * nodes (comments, CDATA sections, ...) are omitted. A space is emitted around every
 * tag, as in the original implementation.
 *
 * @param DOMNodeList|array|null $nodeList         Nodes to serialise; anything non-iterable yields ''.
 * @param array                  $tagsToStrip      Lower-case tag names to drop entirely.
 * @param array                  $attributesToSkip Lower-case attribute names to drop; the special
 *                                                 entry 'on*' drops every attribute starting with "on".
 * @param string|null            $baseUrl          Optional base URL for src/href resolution. Only used
 *                                                 when a resolve_href($base, $url) function exists;
 *                                                 that helper is not defined in this snippet.
 * @return string The filtered HTML.
 */
function toHTML($nodeList, $tagsToStrip=array('script','object','noscript','form','style'),$attributesToSkip=array('on*'),$baseUrl=null) {
    // A failed XPath query yields false and a missing node yields null: nothing to print.
    if (!is_array($nodeList) && !($nodeList instanceof Traversable)) {
        return '';
    }

    // HTML void elements: they never have content nor a closing tag.
    $voidTags = array('area','base','br','col','embed','hr','img','input','link','meta','param','source','track','wbr');
    // Evaluate the wildcard once instead of once per attribute.
    $skipEventHandlers = in_array('on*', $attributesToSkip, true);

    $html = '';
    foreach ($nodeList as $node) {
        $nodeName = strtolower($node->nodeName);
        if (in_array($nodeName, $tagsToStrip, true)) {
            continue;
        }
        if ($nodeName === '#text') {
            $html .= htmlspecialchars($node->textContent, ENT_COMPAT, 'UTF-8', true);
            continue;
        }
        if (strncmp($nodeName, '#', 1) === 0) {
            // Comments, CDATA sections and other non-element nodes are not rendered.
            continue;
        }

        $html .= ' <'.$node->nodeName;
        if ($node->attributes) {
            foreach ($node->attributes as $attribute) {
                $attributeName = strtolower($attribute->nodeName);
                if (in_array($attributeName, $attributesToSkip, true)) {
                    continue;
                }
                // The wildcard only filters when it was requested; without it every
                // remaining attribute is kept (the original dropped all of them).
                if ($skipEventHandlers && strncmp($attributeName, 'on', 2) === 0) {
                    continue;
                }
                $value = $attribute->nodeValue;
                if ($baseUrl !== null && function_exists('resolve_href') && in_array($attributeName, array('src','href'), true)) {
                    $value = resolve_href($baseUrl, $value);
                }
                // nodeValue is entity-decoded, so it must be re-escaped for a double-quoted attribute.
                $html .= ' '.$attribute->nodeName.'="'.htmlspecialchars($value, ENT_COMPAT, 'UTF-8', true).'"';
            }
        }

        if (in_array($nodeName, $voidTags, true)) {
            $html .= ' />';
            continue;
        }

        $html .= '> ';
        // An element without children simply contributes an empty string here.
        $html .= toHTML($node->childNodes, $tagsToStrip, $attributesToSkip, $baseUrl);
        $html .= ' </'.$node->nodeName.'> ';
    }
    return $html;
}

// Print the filtered markup for $table (the node list from the question's XPath query).
echo toHTML($table);

Upvotes: 2

DaveRandom
DaveRandom

Reputation: 88657

Firstly, DOMDocument has a getElementById() method so your XPath is unnecessary - although I suspect that is how it works underneath.

Secondly, in order to get fragments of markup rather than a whole document, you use DOMNode::C14N(), so your code would look like this:

<?php

    // Load the HTML into a DOMDocument.
    // file_get_contents() needs an explicit scheme: without "http://" the
    // argument is treated as a local file path.
    // (DOMDocument::loadHTMLFile() can also load from a path or URL directly;
    // loadHTML() itself only accepts a string of markup.)
    $html = file_get_contents("http://www.site.com");
    if ($html === false) {
        throw new RuntimeException('Unable to fetch the page');
    }

    $dom = new DOMDocument('1.0');

    // Real-world markup is rarely valid: buffer libxml's parse warnings
    // instead of emitting them, then restore the previous setting.
    $previousSetting = libxml_use_internal_errors(true);
    $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    // Get the target element; getElementById() returns null when no element has that id
    $element = $dom->getElementById('myid');
    if ($element === null) {
        throw new RuntimeException('No element with id "myid" was found');
    }

    // Get the markup of just that element as a string.
    // C14N() produces canonical XML (empty elements are written as <br></br>);
    // on PHP >= 5.3.6, $dom->saveHTML($element) gives an HTML serialisation instead.
    $string = $element->C14N();

See a working example.

Upvotes: 8

Related Questions