Reputation: 153
I understand this isn't exactly the smallest code, i've tried to cut it down as much as i could. The script just consumes more and more memory until it finally runs out. I've used unset() where possible but it doesn't seem to have any effect. It always seems to error in the MultiGet function but i'm not sure if that is where the leak is. Any input would be greatly appreciated.
public function Test()
{
$base = dirname(__FILE__) .'/';
$prod_file = $base.'products.dbf';
$this->dbf->load($prod_file);
$num_rec=$ci->dbf->dbf_num_rec;
$buffer = Array();
for($i=0;$i<$num_rec;$i++):
$row = $ci->dbf->getRowAssoc($i);
$info = Array('part_number' => $row['PART_NUM'],
'td_group_id' => $row['GRP'],
'name' => 'DESCR');
$this->db->where('td_group_id',$info['td_group_id']);
$result = $this->db->get('tbl_categories')->row_array();
if(isset($result['id'])):
$info['category_id'] = $result['id'];
$buffer[] = $info;
endif;
if(count($buffer) == 100 || $i == $num_rec -1):
$url_buffer = Array();
foreach($buffer as $row):
$url_buffer[] = $this->_product_url($row['part_number']);
endforeach;
$html_returns = $this->MultiCrawl($url_buffer);
foreach($html_returns as $url_index=>$html):
$more_info = $this->_extract_more_info($html);
if($more_info):
$more_info['category_id'] = $buffer[$url_index]['category_id'];
$more_info['td_part_number'] = $buffer[$url_index]['part_number'];
$this->_parse_product($more_info);
endif;
endforeach;
$buffer = Array();
endif;
endfor;
}
function MultiGet($all_urls)
{
$useragent = $this->_useragent;
$cookie_file = $this->_cookie_file;
$url_index = $this->UrlIndex($all_urls);
$return_buffer = Array();
$mh = curl_multi_init();
$ch = Array();
$max_connections = 15;
$index = 0;
$open_connections = 0;
$execReturnValue = true;
$running = true;
$max_index = count($all_urls)-1;
$url_count = count($all_urls);
$buffer_count = 0;
while ($buffer_count < $url_count){
if($open_connections < $max_connections && $index <= $max_index):
for($i=$open_connections;$i<$max_connections && $index <= $max_index;$i++):
$url = $all_urls[$index];
$ch[$index] = curl_init($url);
curl_setopt($ch[$index],CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch[$index],CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch[$index],CURLOPT_COOKIESESSION, false);
curl_setopt($ch[$index],CURLOPT_SSL_VERIFYHOST , false);
curl_setopt($ch[$index],CURLOPT_SSL_VERIFYPEER , false);
curl_setopt($ch[$index],CURLOPT_COOKIEJAR, $cookie_file);
curl_setopt($ch[$index],CURLOPT_COOKIEFILE, $cookie_file);
curl_setopt($ch[$index],CURLOPT_USERAGENT,$useragent);
curl_multi_add_handle($mh, $ch[$index]);
$open_connections++;
$index++;
$execReturnValue = curl_multi_exec($mh,$running);
usleep(200);
endfor;
endif;
$execReturnValue = curl_multi_exec($mh,$running);
$ready=curl_multi_select($mh);
while($info=curl_multi_info_read($mh)){
$status=curl_getinfo($info['handle'],CURLINFO_HTTP_CODE);
if($status==200){
$successUrl=curl_getinfo($info['handle'],CURLINFO_EFFECTIVE_URL);
$curl_index = $url_index[$successUrl];
$return_buffer[$curl_index] = curl_multi_getcontent($ch[$curl_index]);
$buffer_count = count($return_buffer);
curl_multi_remove_handle($mh, $ch[$curl_index]);
curl_close($ch[$curl_index]);
unset($ch[$curl_index]);
$open_connections--;
}else{
echo "ERROR: $status\n";
}
}
}
curl_multi_close($mh);
unset($mh);
return $return_buffer;
}
private function _extract_more_info($html)
{
$buffer = array();
$query = "//img[@id='ctl00_cphMain_cntrlProductProfile_imgprodimage']";
$result = $this->_xquery($html,$query);
$node = $result instanceof DOMNode?$this->_to_dom_node($result):null;
if(!$node) return null;
$buffer['td_img_url'] = $node?trim($node->getAttribute('src')):null;
unset($result);
$query = "//span[@class='priceLarge']";
$result = $this->_xquery($html,$query);
$node = $result instanceof DOMNode?$this->_to_dom_node($result):null;
if(!$node) return null;
$buffer['price'] = $node?trim($node->nodeValue):null;
if($buffer['price'] == 'Req. Auth.') return null;
unset($result);
$query = "//span[@id='ctl00_cphMain_cntrlProductProfile_newLtFinalPrice']";
$result = $this->_xquery($html,$query);
$node = $result instanceof DOMNode?$this->_to_dom_node($result):null;
if(!$node) return null;
$buffer['msrp'] = $node?trim($node->nodeValue):null;
unset($result);
$query = "//span[@id='ctl00_cphMain_cntrlProductProfile_newLTMRF']";
$result = $this->_xquery($html,$query);
$node = $result instanceof DOMNode?$this->_to_dom_node($result):null;
if(!$node) return null;
$buffer['manf_part_number'] = $node?trim($node->nodeValue):null;
unset($result);
$query = "//span[@id='ctl00_cphMain_cntrlProductProfile_newLblUPC']";
$result = $this->_xquery($html,$query);
$node = $result instanceof DOMNode?$this->_to_dom_node($result):null;
$buffer['upc_part_number'] = $node?trim($node->nodeValue):null;
unset($result);
$query = "//td[@class='black_text_WUL']";
$result = $this->_xquery($html,$query);
$node = $result instanceof DOMNode?$this->_to_dom_node($result):null;
if(!$node) return null;
$buffer['manufacturer'] = $node?trim($node->nodeValue):null;
unset($result);
$query = "//td[@class='textt' and @colspan='3']";
$result = $this->_xquery($html,$query);
$node = $result instanceof DOMNode?$this->_to_dom_node($result):null;
if(!$node) return null;
$buffer['short_description'] = $node?trim($node->nodeValue):null;
unset($result);
$query = "//div[@id='ctl00_cphMain_pnlMarketingDesc']//td[@class='textt']";
$result = $this->_xquery($html,$query);
$node = $result instanceof DOMNode?$this->_to_dom_node($result):null;
if(!$node) return null;
$buffer['long_description'] = $node?trim($node->nodeValue):null;
unset($result);
$query = "//table[@id='ctl00_cphMain_cntrlMainSpecs_dgSpecs']";
$result = $this->_xquery($html,$query);
$table = $result instanceof DOMNode?$this->_to_dom_node($result):null;
unset($result);
if(!$table) return null;
$table_array = Array();
$rows = $table->getElementsByTagName('tr');
foreach($rows as $tr):
$temp = Array();
$columns = $tr->getElementsByTagName('td');
$caption = $columns->length > 0 && $columns->length <= 2 ? trim($columns->item(0)->nodeValue) : null;
$value = $columns->length == 2 ? trim($columns->item(1)->nodeValue) : null;
if ($caption) $table_array[$caption] = $value;
endforeach;
$buffer['main_specs']=$table_array;
$query = "//table[@id='ctl00_cphMain_cntrlExtSpecs_tblData']";
$result = $this->_xquery($html,$query);
$table = $result instanceof DOMNode?$this->_to_dom_node($result):null;
unset($result);
$buffer['additional_specs'] = null;
if(!$table) return $buffer;
$table_array = Array();
$rows = $table->getElementsByTagName('tr');
foreach($rows as $tr):
$temp = Array();
$columns = $tr->getElementsByTagName('td');
$caption = $columns->length > 0 && $columns->length <= 2 ? trim($columns->item(0)->nodeValue) : null;
$value = $columns->length == 2 ? trim($columns->item(1)->nodeValue) : null;
if ($caption) $table_array[$caption] = $value;
endforeach;
$buffer['additional_specs']=$table_array;;
return $buffer;
}
private function _xquery($html,$query,$allnodes = false){
$src = '';
$dom = new DOMDocument();
$node = null;
if (@$dom->loadHTML($html)) {
$xpath = new DOMXpath($dom);
$nodeList = $xpath->query($query);
if ($nodeList->length > 0) {
$node = $allnodes==false?$nodeList->item(0):$nodeList;
}
}
unset($xpath);
unset($nodeList);
unset($dom);
return $node;
}
Upvotes: 0
Views: 708
Reputation: 41519
Strategies to find a leak?
foo
is O(n), bar
is O(n) and bar
calls foo
, the result may become O(n*n).At first sight, you're crawling a series of url's. These may contain more url's, to be crawled using the MultiCrawl
method. Are you sure there can't be a cycle in there? (working with folders has tricked me more than once: browsing '.' as a subfolder yields infinite loops)
Upvotes: 1