Reputation: 561
I have over 500 pages (static) containing content structures this way,
<section>
Some text
<strong>Dynamic Title (Different on each page)</strong>
<strong>Author name (Different on each page)</strong>
<strong>Category</strong>
(<b>Content</b> <b>MORE TEXT HERE)</b>
</section>
And I need to extract the data as formatted below, using PHP Simple HTML DOM Parser
$title = <strong>Dynamic Title (Different on each page)</strong>
$authot = <strong>Author name (Different on each page)</strong>
$category = <strong>Category</strong>
$content = (<b>Content</b> <b>MORE TEXT HERE</b>)
I have failed so far and can't get my head around it, appreciate any advice or code snippet to help me going on.
EDIT 1, I have now solved the part with strong tags using,
$html = file_get_html($url);
$links = array();
foreach($html->find('strong') as $a) {
$content[] = $a->innertext;
}
$title= $content[0];
$author= $content[1];
the only remaining issue is --> How to extract content within parentheses? using similar method?
Upvotes: 0
Views: 2288
Reputation: 561
My final code that works now looks like this.
$html = file_get_html($url);
$links = array();
foreach($html->find('strong') as $a) {
$content[] = $a->innertext;
}
$title= $content[0];
$author= $content[1];
$category = $content[2];
$details = file_get_html($url)->plaintext;
$input = $details;
preg_match_all("/\(.*?\)/", $input, $matches);
print_r($matches[0]);
Upvotes: 0
Reputation: 160
OK first you want to get all of the tags Then you want to search through those again for the tags and tags Something like this:
// Create DOM from URL or file
$html = file_get_html('http://www.example.com/');
$strong = array();
// Find all <sections>
foreach($html->find('section') as $element) {
$section = $element->src;
// get <strong> tags from <section>
foreach($section->find('strong') as $strong) {
$strong[] = $strong->src;
}
$title = $strong[0];
$authot = $strong[1];
$category = $strong[2];
}
To get the parts in parentheses - just get the b tag text and then add the () brackets. Or if you're asking how to get parts in between the brackets - use explode then remove the closing bracket:
$pieces = explode("(", $title);
$different_on_each_page = str_replace(")","",$pieces[1]);
Upvotes: 2
Reputation: 462
$html_code = 'html';
$dom = new \DOMDocument();
$dom->LoadHTML($html_code);
$xpath = new \DOMXPath($this->dom);
$nodelist = $xpath->query("//strong");
for($i = 0; $i < $nodelist->length; $i++){
$nodelist->item($i)->nodeValue; //gives you the text inside
}
Upvotes: 0