Reputation: 4171
I am trying to crawl a website whose HTML pages are badly formed. Take the following page as an example:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="keywords" content="physician, doctor, CME, CE, physician job recuiter, Chinese speaking doctors, medical school alumni," />
<meta name="description" content="An online database for physicians who obtained medical degree in China and are practicing in USA. It contains over 6,000 profiles. Email:[email protected] for any question." />
<title>
CMG Physician Database - 华人医生数据库
</title>
<link rel="stylesheet" type="text/css" href="default.css" />
<script type="text/javascript">
// Google Analytics asynchronous tracking bootstrap.
// Reuse an already-defined command queue if there is one, otherwise start an empty one.
var _gaq = _gaq || [];
// Queue the account ID and a page-view command (presumably consumed by ga.js once it loads).
_gaq.push(['_setAccount', 'UA-27283808-3']);
_gaq.push(['_trackPageview']);
(function() {
// Create a script element flagged as async so loading it does not block the page.
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
// Use the SSL host when the current page was served over https, the plain-http host otherwise.
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
// Insert the new element immediately before the first script element in the document.
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
</script>
<script type="text/javascript">
/**
 * Checks the required fields of the form named "myForm".
 *
 * Fields are checked in a fixed order (last, first, states, specialty_id,
 * gradschool_id). For the first field whose value is the empty string an
 * alert with the matching message is shown and checking stops; later
 * fields are not looked at.
 *
 * @returns {boolean} true when every required field is non-empty,
 *                    false as soon as one empty field has been reported.
 */
function validateForm()
{
  // Everything here is declared with `var`. The previous version assigned
  // to an undeclared `valid`, which leaked a global variable and throws a
  // ReferenceError under strict mode.
  var form = document.myForm;

  // Ordered [field name, message] pairs; order decides which alert wins.
  var required = [
    [ "last", "Last name is required" ],
    [ "first", "First name is required" ],
    [ "states", "State is required" ],
    [ "specialty_id", "Specialty is required" ],
    [ "gradschool_id", "School is required" ]
  ];

  for ( var i = 0; i < required.length; i++ )
  {
    if ( form[required[i][0]].value == "" )
    {
      alert ( required[i][1] );
      return false;
    }
  }
  return true;
}
</script>
<script type="text/javascript">
// Send visitors whose screen is narrower than 800 pixels to the /m/ path
// (presumably the mobile version of the site).
if (screen.width<800) {
window.location="http://physician.cmgforum.net/m/";
}
</script>
<script language="JavaScript"
type="text/JavaScript">
/**
 * Navigates the document to the URL stored in the currently selected
 * option of a select element.
 *
 * @param newLoc  the select element; its `options[selectedIndex].value`
 *                is used as the destination URL.
 *
 * Nothing happens when the selected value is the empty string.
 */
function changePage(newLoc)
{
  // Declared with `var`: the previous version assigned to an undeclared
  // `nextPage`, leaking a global (and throwing under strict mode).
  var nextPage = newLoc.options[newLoc.selectedIndex].value;
  if (nextPage != "")
  {
    document.location.href = nextPage;
  }
}
</script>
</head>
<body>
<div id="outer">
<div id="upbg"></div>
<div id="inner">
<div id="header">
<embed src="flash/cmglogo.swf" width="685" Height="90">
</div>
<div id="menu">
<center>
<ul>
<li><a href="/index.php">Home</a></li>
<li><a href="/search4-d.php">Combo Search</a></li>
<li><a href="/search-d.php">Name</a></li>
<li><a href="/gradsch-d.php">Schools</a></li>
<li><a href="/specialty-d.php">Specialties</a></li>
<li><a href="/loc-spe.php">Local Specialties</a></li>
<li><a href="/location-d.php">States</a></li>
<li><a href="/locations3-d.php">Cities</a></li>
</center>
</div>
<table width="100%">
<tr>
<td width="2%"></td>
<td>
<form method="POST" name="menu" >
<select name="selectedPage"
onChange="changePage(this.form.selectedPage)">
<option value = "" selected>Site Navigation</option>
<option value = "/index.php">Home</option>
<option value = "/facilities.php"> Medical Facilities</option>
<option value = "/stats-d.php">Statistics</option>
<option value = "/contact.php">Contacts</option>
<option value = "/top5.php">The Top5</option>
<option value = "http://blog.cmgforum.net">Blog</option>
<option value = "/links.php">Links</option>
<option value = "/addme.php">Add Me</option>
<option value = "/news.php">News</option>
<option value = "/faq.php">FAQ</option>
<option value = "/pop-d.php">人气</option>
<option value = "/pgy-d.php">PGY</option>
<option value = "/video2-d.php">CMG Videos</option>
<option value = "/url.php">CMG Website</option>
</select>
</form>
</td>
<td>
<script type="text/javascript"><!--
google_ad_client = "ca-pub-6867265085889194";
/* header-links */
google_ad_slot = "9503010667";
google_ad_width = 468;
google_ad_height = 15;
//-->
</script>
<script type="text/javascript"
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
</script>
</td>
</tr></table>
</body>
</html>
<center>
<table><tr>
<td width="5%"></td>
<td>
<center>
<script type="text/javascript"><!--
google_ad_client = "pub-6867265085889194";
/* 468x60, created 8/19/10 */
google_ad_slot = "1699885909";
google_ad_width = 400;
google_ad_height = 60;
//-->
</script>
<script type="text/javascript"
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
</script>
</center>
</td>
<td>
<img src="images/code.jpg" alt="QR Code" height="60" width="60">
</td>
</tr></table>
</center>
<!---
<marquee ALIGN="Top" LOOP="infinite" BGCOLOR="#FFFFFF" DIRECTION="left" HEIGHT=16 WIDTH=640 scrollamount="3" scrolldelay="1"><FONT SIZE="3" FACE="courier" COLOR=blue>
<a href="./bridge.php">第三届健桥医学峰会即将在波士顿召开, 更多的信息请点击这里!</a>
</font></marquee>
--->
<table width="100%" style="text-shadow: 1px 1px 3px #999;"><tr><td width="3%"></td><td width="16%"><a href="loading/specialty/1.php">Allergy & Immunology</a>(20)<br><a href="loading/specialty/2.php">Anesthesiology</a>(595)<br><a href="loading/specialty/3.php">Cardiology</a>(129)<br><a href="loading/specialty/38.php">Dentistry</a>(437)<br><a href="loading/specialty/4.php">Dermatology</a>(29)<br><a href="loading/specialty/5.php">Emergency Medicine</a>(8)<br><a href="loading/specialty/6.php">Endocrinoloy</a>(60)<br><a href="loading/specialty/7.php">Family Practice</a>(434)<br><a href="loading/specialty/8.php">Gastroenterology</a>(88)<br><a href="loading/specialty/9.php">General Surgery</a>(94)<br><a href="loading/specialty/10.php">Geriatric Medicine</a>(48)<br><a href="loading/specialty/11.php">Hem/Onc</a>(295)<br><a href="loading/specialty/12.php">Infectious Disease</a>(12)<br><a href="loading/specialty/13.php">Internal Medicine</a>(1880)<br><a href="loading/specialty/36.php">Medical Genetics</a>(47)<br><br></td><td width="3%"></td><td width="16%"><a href="loading/specialty/14.php">Nephrology</a>(114)<br><a href="loading/specialty/15.php">Neurology</a>(333)<br><a href="loading/specialty/16.php">Neurosurgery</a>(11)<br><a href="loading/specialty/17.php">OB/GYN</a>(83)<br><a href="loading/specialty/18.php">Occupational Med</a>(33)<br><a href="loading/specialty/19.php">Ophthalmology</a>(63)<br><a href="loading/specialty/41.php">Optometry</a>(28)<br><a href="loading/specialty/20.php">Orthopaedics</a>(18)<br><a href="loading/specialty/21.php">Otolaryngology</a>(10)<br><a href="loading/specialty/22.php">Pathology</a>(1235)<br><a href="loading/specialty/23.php">Pediatrics</a>(300)<br><a href="loading/specialty/42.php">Pediatric Hem/Onc</a>(8)<br><a href="loading/specialty/43.php">Pediatric Cardiology</a>(9)<br><a href="loading/specialty/44.php">Pediatric GI</a>(9)<br><a href="loading/specialty/45.php">Pediatric Neurology</a>(13)<br><br></td><td width="3%"></td><td 
width="17%"><a href="loading/specialty/46.php">Pediatric Endocrinology</a>(10)<br><a href="loading/specialty/47.php">Pediatric Rheumatology</a>(2)<br><a href="loading/specialty/48.php">Pediatric Allergy(Immu)</a>(5)<br><a href="loading/specialty/25.php">Plastic Surgery</a>(5)<br><a href="loading/specialty/26.php">Psychiatry</a>(319)<br><a href="loading/specialty/27.php">Pulmonary Disease</a>(30)<br><a href="loading/specialty/28.php">Radiation Oncology</a>(54)<br><a href="loading/specialty/29.php">Diag Radiology(Nuclear)</a>(156)<br><a href="loading/specialty/24.php">Rehabilitation</a>(216)<br><a href="loading/specialty/40.php">Sports Medicine</a>(2)<br><a href="loading/specialty/30.php">Rheumatology</a>(43)<br><a href="loading/specialty/37.php">Thoracic Surgery</a>(17)<br><a href="loading/specialty/32.php">Urology</a>(12)<br><a href="contact.php">Add New Specialty</a><br><a href="contact.php">Add New Specialty</a><br><br></td><td width="3%"></td><td width="20%"><script type="text/javascript"><!--
google_ad_client = "ca-pub-6867265085889194";
/* spe-120-240 */
google_ad_slot = "2416199460";
google_ad_width = 120;
google_ad_height = 240;
//-->
</script>
<script type="text/javascript"
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
</script>
<br>
</td></tr></table><div id="top5">
<html>
<head>
<link rel="stylesheet" type="text/css" href="default.css" />
</head>
<body>
<center>
<div id="ad">
<script type="text/javascript"><!--
google_ad_client = "pub-6867265085889194";
/* 728x90, created 8/26/10 */
google_ad_slot = "7083487992";
google_ad_width = 715;
//google_ad_width = 728;
google_ad_height = 90;
//-->
</script>
<script type="text/javascript"
src="http://pagead2.googlesyndication.com/pagead/show_ads.js">
</script>
</div>
</center>
</body>
</html>
</div>
<center>
<table border="0" BORDERCOLOR=orange><tr>
<td><div style="padding: 2px; border: 1px red solid;"></div></td>
<td><a href=http://www.wikilips.com/ target="_blank"><img src="http://physician.cmgforum.net/images/wikilips.jpg" border="1" width="160" height="80"></a></td>
<td><a href=http://www.linkedin.com/groups?gid=1648407&trk=myg_ugrp_ovr target="_blank"><img src="http://physician.cmgforum.net/images/MedicalCareer_150x65.gif" border="1" width="170" height="80"></a></td>
<td><img src="http://physician.cmgforum.net/images/scan.JPG" border="0" width="80" height="80">
<img src="http://physician.cmgforum.net/images/qrcode.png" border="0" width="80" height="80">
</td>
</tr></table>
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=iso-8859-1" />
<title>terrafirma1.0 by nodethirtythree</title>
<meta name="keywords" content="" />
<meta name="description" content="" />
<link rel="stylesheet" type="text/css" href="default.css" />
</head>
<body>
<center>
<div id="footer">
© copyright 2007 cmgforum.net. all rights reserved.
Contact: [email protected]
</div>
</div>
</div>
</body>
</html>
As we can see, there are two root elements here. In such a case, Scrapy can't evaluate the XPath correctly. Any ideas on how to handle this?
Upvotes: 1
Views: 535
Reputation: 46
When you face malformed HTML pages, try to generalize your XPath expressions, since the browser and Scrapy don't interpret the page in the same way. In this case, if you want to extract the list of links in the table, try an XPath like this:
//tr//td//a[contains(@href,'/specialty/')]
Upvotes: 3