Reputation: 2291
I am trying to use the http://validator.w3.org/nu/
API for Direct Input via POST method.
https://github.com/validator/validator/wiki/Service-%C2%BB-Input-%C2%BB-textarea
This is what I tried, but it did not succeed:
class frontend {
    /**
     * Fetch a URL with cURL and return the response body.
     *
     * The request carries the User-Agent returned by self::random_user_agent(),
     * follows redirects, and excludes response headers from the returned data.
     * NOTE(review): random_user_agent() is not defined in this snippet; it is
     * presumably declared elsewhere in the class - confirm before use.
     *
     * @param string $url Address to request.
     * @return string|bool Response body, or false when the transfer fails.
     */
    public static function file_get_contents_curl($url)
    {
        $ch = curl_init();

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_USERAGENT, self::random_user_agent());
        curl_setopt($ch, CURLOPT_HEADER, false);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);

        // TLS peer verification is deliberately left at cURL's default (enabled).
        // Turning CURLOPT_SSL_VERIFYPEER off lets anyone on the network path
        // impersonate the remote host. If HTTPS requests fail because the local
        // PHP build has no CA bundle, point curl.cainfo (php.ini) or
        // CURLOPT_CAINFO at one instead of disabling verification.

        // With CURLOPT_RETURNTRANSFER set, curl_exec() yields the body as a
        // string, or false on failure; that value is handed back unchanged.
        $data = curl_exec($ch);
        curl_close($ch);

        return $data;
    }
}
// Attempt to validate the markup of https://<domain> through the W3C Nu validator:
// download the page, save a parsed copy under DOCUMENT_ROOT/tmp/, POST the markup
// to the validator and dump the decoded JSON reply.
$domain = 'yahoo.com';
$url = 'https://'.$domain;
// Download the page body; file_get_contents_curl() returns whatever curl_exec() produced.
$html = frontend::file_get_contents_curl($url);
// Keep libxml parse errors internal instead of emitting them as PHP warnings.
libxml_use_internal_errors(true);
$doc = new DOMDocument;
$doc->loadHTML($html);
// Build the output path <DOCUMENT_ROOT>/tmp/<domain>.html, creating the directory if needed.
$html_file_output = $domain.'.html';
$dir = $_SERVER['DOCUMENT_ROOT'].'/tmp/';
if(!file_exists($dir)) {
mkdir($dir);
}
$file_path = $dir.$html_file_output;
// Write the parsed document to disk as HTML.
$doc->saveHTMLFile($file_path);
var_dump($file_path); // the filepath where the file is saved /www.domain.com/tmp/html_file.html
// NOTE(review): $url_validator is assigned here but never used afterwards; the
// cURL handle created further down is not given a CURLOPT_URL at all.
$url_validator = 'http://validator.w3.org/nu/';
// Fields to submit: request JSON output and pass the downloaded markup as "content".
$query = [
'out' => 'json',
'content' => $html // the HTML resulting from $url variable %3C%21DOCTYPE+html%3E%0....
//'content' => $file_path tried also => /www.domain.com/tmp/the_file.html
];
// Flatten the fields into a single URL-encoded query string.
$query_string = http_build_query($query);
var_dump($query_string); // returns string 'out=json&content=doctype html....' or 'out=json&content=F:/SERVER/www/www.domain.com/tmp/yahoo.com.html'
// POST the query string and capture the response body instead of printing it.
// NOTE(review): neither CURLOPT_URL nor CURLOPT_USERAGENT is set on this handle.
$ch = curl_init();
curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_POSTFIELDS, $query_string);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$str_html = curl_exec($ch);
curl_close($ch);
// Decode the reply as JSON.
$data = json_decode($str_html);
var_dump($data); // returns null
// Remove the temporary HTML file written above.
unlink($file_path);
Upvotes: 0
Views: 185
Reputation: 21583
First, the "direct input" API only accepts POST requests in multipart/form-data format, but when you run the fields through http_build_query() you convert them to application/x-www-form-urlencoded format, which that API doesn't understand. (Give CURLOPT_POSTFIELDS an array instead of a string and cURL automatically sends it as multipart/form-data.)
Second, this API blocks requests that lack a User-Agent header, and libcurl has no default User-Agent (curl, the command-line program, does, but libcurl does not), so you must supply one yourself, which your validator request doesn't do.
Fixing those two problems, and adding some simple error-message parsing:
<?php
// Submit a deliberately broken HTML document to the W3C Nu validator's
// "direct input" endpoint and collect the error messages from the HTML report.

// Sample markup containing an intentional error (the "<ERR&OR" fragment).
$html=<<<'HTML'
<!DOCTYPE html>
<html lang="">
<head>
<title>Test</title>
</head><ERR&OR
<body>
<p></p>
</body>
</html>
HTML;

$ch=curl_init();
curl_setopt_array($ch,[
    CURLOPT_URL=>'http://validator.w3.org/nu/',
    // Empty string: advertise every content encoding this libcurl build supports.
    CURLOPT_ENCODING=>'',
    // libcurl sends no User-Agent unless one is supplied explicitly.
    CURLOPT_USERAGENT=>'PHP/'.PHP_VERSION.' libcurl/'.(curl_version()['version']),
    CURLOPT_POST=>1,
    // Passing an array (not a string) makes cURL send multipart/form-data.
    CURLOPT_POSTFIELDS=>[
        'showsource'=>'yes',
        'content'=>$html
    ],
    CURLOPT_RETURNTRANSFER=>1,
]);
// From here on $html holds the validator's HTML report, not the submitted markup.
$html=curl_exec($ch);
if($html===false){
    // Fail loudly rather than handing false to the HTML parser below.
    $curlError=curl_error($ch);
    curl_close($ch);
    throw new RuntimeException('Request to the validator failed: '.$curlError);
}
curl_close($ch);

$parsed=[];
// loadHTML() is an instance method: calling it statically is deprecated in
// PHP 7 and a fatal error in PHP 8, so create the document first.
$domd=new DOMDocument();
// Keep libxml's parse warnings about the report markup out of the output.
libxml_use_internal_errors(true);
$domd->loadHTML($html);
libxml_clear_errors();
$xp=new DOMXPath($domd);
$res=$domd->getElementById("results");
// The leading "." makes the query relative to the context node, so only
// elements inside #results are matched; if #results is missing ($res is
// null) the whole document is searched.
foreach($xp->query(".//*[@class='error']",$res) as $message){
    $parsed['errors'][]=trim($message->textContent);
}
var_dump($html);
var_dump($parsed);
prints:
array(1) {
["errors"]=>
array(4) {
[0]=>
string(156) "Error: Saw < when expecting an attribute name. Probable cause: Missing > immediately before.At line 6, column 1</head><ERR&ORâ©<body>â©<p></p>â©"
[1]=>
string(254) "Error: Element err&or not allowed as child of element body in this context. (Suppressing further errors from this subtree.)From line 5, column 8; to line 6, column 6e>â©</head><ERR&ORâ©<body>â©<p></Content model for element body:Flow content."
[2]=>
string(144) "Error: End tag for body seen, but there were unclosed elements.From line 8, column 1; to line 8, column 7>â©<p></p>â©</body>â©</htm"
[3]=>
string(118) "Error: Unclosed element err&or.From line 5, column 8; to line 6, column 6e>â©</head><ERR&ORâ©<body>â©<p></"
}
}
The garbled characters appear because DOMDocument does not assume UTF-8 when the markup passed to loadHTML() declares no encoding. As far as I know there is no clean way to set DOMDocument's default charset, but you can work around it by prepending an XML encoding declaration:
// loadHTML() must be called on an instance (a static call is a fatal error in PHP 8).
// The prepended XML declaration makes the parser read the markup as UTF-8.
$domd=new DOMDocument();
@$domd->loadHTML('<?xml encoding="UTF-8">'.$html);
which makes it print:
array(1) {
["errors"]=>
array(4) {
[0]=>
string(147) "Error: Saw < when expecting an attribute name. Probable cause: Missing > immediately before.At line 6, column 1</head><ERR&OR↩<body>↩<p></p>↩"
[1]=>
string(245) "Error: Element err&or not allowed as child of element body in this context. (Suppressing further errors from this subtree.)From line 5, column 8; to line 6, column 6e>↩</head><ERR&OR↩<body>↩<p></Content model for element body:Flow content."
[2]=>
string(135) "Error: End tag for body seen, but there were unclosed elements.From line 8, column 1; to line 8, column 7>↩<p></p>↩</body>↩</htm"
[3]=>
string(109) "Error: Unclosed element err&or.From line 5, column 8; to line 6, column 6e>↩</head><ERR&OR↩<body>↩<p></"
}
}
... which is better, but still contains the arrows used on the webpage, which can be removed with
// Detach every element whose class attribute is exactly "lf" (the arrow
// markers shown on the web page) from the parsed document.
$lfMarkers=$xp->query("//*[@class='lf']");
foreach($lfMarkers as $lfMarker){
    $lfParent=$lfMarker->parentNode;
    $lfParent->removeChild($lfMarker);
}
which makes it print:
array(1) {
["errors"]=>
array(4) {
[0]=>
string(138) "Error: Saw < when expecting an attribute name. Probable cause: Missing > immediately before.At line 6, column 1</head><ERR&OR<body><p></p>"
[1]=>
string(236) "Error: Element err&or not allowed as child of element body in this context. (Suppressing further errors from this subtree.)From line 5, column 8; to line 6, column 6e></head><ERR&OR<body><p></Content model for element body:Flow content."
[2]=>
string(126) "Error: End tag for body seen, but there were unclosed elements.From line 8, column 1; to line 8, column 7><p></p></body></htm"
[3]=>
string(100) "Error: Unclosed element err&or.From line 5, column 8; to line 6, column 6e></head><ERR&OR<body><p></"
}
}
Upvotes: 1