Extracting Information from website IMDB always generates an error

Question

I'm trying to use this class to grab IMDb data by URL; the class is intended to grab IMDb info from a URL, e.g. http://www.imdb.com/title/tt0371746/

Instead it returns an error : { $param['error'] = "No Title found in Search Results!"; return $param; }

That error is meant to be generated when there is no movie at the URL, so why is it returned every time, even though there is a movie at every URL I pass in?

This is the class :

time = "NOW()";
}

/**
 * Look up a title and return its scraped IMDb details.
 *
 * The input is resolved to a title URL by scruburl(), that page is fetched
 * with geturl(), and the HTML is handed to GrabInfo().
 *
 * @param string $input Search text forwarded to scruburl().
 * @return array The fields produced by GrabInfo() plus 'imdb_url', or an
 *               array holding only an 'error' message when the lookup fails.
 */
function getMovieInfo($input) {
    $titleUrl = $this->scruburl($input);
    if ($titleUrl === null) {
        return array('error' => "No Title found in Search Results!");
    }

    $html = $this->geturl($titleUrl);
    // NOTE(review): the needle is an empty string in this listing; presumably
    // a page marker was lost from the source - confirm the intended text.
    if (stripos($html, "") === false) {
        return array('error' => "No Media found on IMDb!");
    }

    $details = $this->GrabInfo($html);
    $details['imdb_url'] = $titleUrl;
    return $details;
}

/**
 * Resolve user input to an IMDb title URL.
 *
 * When $input already contains an IMDb title link (imdb.com/title/tt<digits>)
 * the URL is rebuilt from the title id and returned without any network
 * access. Otherwise Google is queried for "imdb <input>" and the first match
 * of the result pattern is returned.
 *
 * @param string $input Free-text title or an IMDb title URL.
 * @return string|null Title URL, or null when the search produced no match.
 */
function scruburl($input) {
    // A direct title link needs no search-engine round trip.
    if (preg_match('~imdb\.com/title/(tt[0-9]+)~i', $input, $direct) === 1) {
        return "http://www.imdb.com/title/" . strtolower($direct[1]) . "/";
    }

    // rawurlencode() never emits a backslash, so the stripslashes() call that
    // used to wrap it had no effect and has been removed.
    $url = "http://www.google.com/search?q=imdb+" . rawurlencode($input);
    $content = $this->geturl($url);

    // NOTE(review): this pattern has no capture group as listed, although
    // group 1 is requested - it looks truncated; restore it from the source.
    $urls = $this->match_all('/.*?/ms', $content, 1);
    if (!isset($urls[0])) {
        return null;
    }
    return $urls[0];
}

/**
 * Download a URL with cURL and hand back the response.
 *
 * The request uses a 5 second connect timeout and a fixed Firefox
 * user-agent string.
 *
 * @param string $url Address to fetch.
 * @return string|bool Whatever curl_exec() returned for the transfer.
 */
function geturl($url) {
    $handle = curl_init();
    curl_setopt_array($handle, array(
        CURLOPT_URL            => $url,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_CONNECTTIMEOUT => 5,
        CURLOPT_USERAGENT      => "Mozilla/5.0 (Windows NT 5.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    ));

    $response = curl_exec($handle);
    curl_close($handle);

    return $response;
}

/**
 * Emit a JPEG content-type header and fetch the poster bytes.
 *
 * @param string $image URL-encoded poster address; it is passed through
 *                      rawurldecode() before the request is made.
 * @return string|bool Whatever curl_exec() returned for the transfer.
 */
function getimage($image) {
    header("Content-type: image/jpeg");

    $posterUrl = rawurldecode($image);

    $handle = curl_init();
    curl_setopt_array($handle, array(
        CURLOPT_URL            => $posterUrl,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_CONNECTTIMEOUT => 5,
    ));

    $bytes = curl_exec($handle);
    curl_close($handle);

    return $bytes;
}

/**
 * Run preg_match_all() and return the matches of one capture group.
 *
 * @param string $regex PCRE pattern including delimiters and modifiers.
 * @param mixed  $str   Subject text; callers may pass the false returned by
 *                      match(), so it is cast to a string first.
 * @param int    $i     Capture-group index (0 = whole match).
 * @return array|bool List of matches for group $i, an empty array when the
 *                    pattern defines no such group, or false when
 *                    preg_match_all() itself fails.
 */
function match_all($regex, $str, $i = 0) {
    if (preg_match_all($regex, (string) $str, $matches) === false) {
        return false;
    }

    // Asking for a group the pattern does not define used to raise an
    // undefined-offset notice and hand null to the caller's foreach.
    return isset($matches[$i]) ? $matches[$i] : array();
}

/**
 * Run preg_match() and return one capture group of the first match.
 *
 * @param string $regex PCRE pattern including delimiters and modifiers.
 * @param mixed  $str   Subject text; cast to a string before matching.
 * @param int    $i     Capture-group index (0 = whole match).
 * @return string|bool The captured text, or false when the pattern does not
 *                     match or group $i is absent from the result.
 */
function match($regex, $str, $i = 0) {
    // preg_match() omits trailing groups that did not take part in the match,
    // so the index is checked instead of read blindly.
    if (preg_match($regex, (string) $str, $found) === 1 && isset($found[$i])) {
        return $found[$i];
    }

    return false;
}

/**
 * Scrape the individual fields of an IMDb title page into an array.
 *
 * Each field is pulled out of $content with match() / match_all(). When a
 * title id is found, the /releaseinfo page for that id is fetched as well to
 * fill the also-known-as, USA-title and release-date fields. Fields that are
 * collected as lists are joined into strings with implode() before returning.
 *
 * NOTE(review): this listing looks damaged - most regex patterns have lost
 * their HTML-tag portions and several statements run into each other (the
 * type, trailer, release_date, plot, storyline and tagline assignments), so
 * the body does not parse as shown. Restore the patterns from the original
 * source before relying on it.
 *
 * @param string $content HTML of the title page.
 * @return array Field name => scraped value.
 */
function GrabInfo($content) {
    $param = array();
    $param['title_id'] = $this->match('//ms', $content, 1);
    $param['title'] = trim($this->match('/(.*?) $.*?/ms', $content, 1));
    $param['type'] = $this->match('/match('/.*?\(.*?([0-9][0-9][0-9][0-9]).*?$.*?/ms', $content, 1));
    $param['rating'] = $this->match('/([0-9].[0-9])/m', $content, 1);
    $param['ratingcount'] = $this->match('/(.*?)/m', $content, 1);
    $param['reviewcount'] = $this->match('/(.*?)/m', $content, 1);
    $param['trailer'] = $this->match('|match_all('/(.*?)/ms', $this->match('/Director.?:(.*?)(|>.?and )/ms', $content, 1), 1) as $m) {
        array_push($param['directors'], $m);
    } $param[directors] = is_array(($param[directors])) ? implode(", ", ($param[directors])) : ($param[directors]);
    $param['writers'] = array();
    foreach ($this->match_all('/(.*?)/ms', $this->match('/Writer.?:(.*?)(|>.?and )/ms', $content, 1), 1) as $m) {
        array_push($param['writers'], $m);
    } $param[writers] = is_array(($param[writers])) ? implode(", ", ($param[writers])) : ($param[writers]);
    $param['stars'] = array();
    foreach ($this->match_all('/(.*?)/ms', $this->match('/Stars:(.*?)/ms', $content, 1), 1) as $m) {
        array_push($param['stars'], $m);
    } $param[stars] = is_array(($param[stars])) ? implode(", ", ($param[stars])) : ($param[stars]);
    $param['cast'] = array();
    foreach ($this->match_all('/(.*?)/ms', $content, 1) as $m) {
        array_push($param['cast'], trim(strip_tags($m)));
    } $param[cast] = is_array(($param[cast])) ? implode(", ", ($param[cast])) : ($param[cast]);
    $param['mpaa_rating'] = $this->match('/infobar">./ms', $content, 1);
    if ($param['title_id'] != "") {
        $releaseinfoHtml = $this->geturl("http://www.imdb.com/title/" . $param['title_id'] . "/releaseinfo");
        $param['also_known_as'] = $this->getAkaTitles($releaseinfoHtml, $usa_title);
        $param[also_known_as] = is_array(($param[also_known_as])) ? implode("
", ($param[also_known_as])) : ($param[also_known_as]);
        $param['usa_title'] = $usa_title;
        $param['release_date'] = $this->match('/Release Date:.*?([0-9][0-9]? (January|February|March|April|May|June|July|August|September|October|November|December) (19|20)[0-9][0-9]).*?(\(|getReleaseDates($releaseinfoHtml);
        $param[release_dates] = is_array(($param[release_dates])) ? implode("
", ($param[release_dates])) : ($param[release_dates]);
    } $param['plot'] = trim(strip_tags($this->match('/Users:.*?(.*?)(
|match('/img_primary">.*?/ms', $content, 1);
    $param['poster_large'] = "";
    $param['poster_small'] = "";
    if ($param['poster'] != '' && strrpos($param['poster'], "nopicture") === false && strrpos($param['poster'], "ad.doubleclick") === false) {
        $param['poster_large'] = substr($param['poster'], 0, strrpos($param['poster'], "_V1.")) . "_V1._SY500.jpg";
        $param['poster_small'] = substr($param['poster'], 0, strrpos($param['poster'], "_V1.")) . "_V1._SY150.jpg";
    } else {
        $param['poster'] = "";
    } $param['runtime'] = trim($this->match('/Runtime:.*?([0-9]+) min.*?/ms', $content, 1));
    if ($param['runtime'] == '')
        $param['runtime'] = trim($this->match('/infobar.*?([0-9]+) min.*?/ms', $content, 1));
    $param['oscars'] = trim($this->match('/Won ([0-9]+) Oscars./ms', $content, 1));
    $param['awards'] = trim($this->match('/([0-9]+) wins/ms', $content, 1));
    $param['nominations'] = trim($this->match('/([0-9]+) nominations/ms', $content, 1));
    $param['storyline'] = trim(strip_tags($this->match('/Storyline(.*?)(|match('/Release Date.?:(.*?)(|See more)/ms', $content, 1)));
    $param['keywords'] = array();
    foreach ($this->match_all('/(.*?)/ms', $this->match('/Plot Keywords.?:(.*?)(|See more)/ms', $content, 1), 1) as $m) {
        array_push($param['keywords'], $m);
    } $param[keywords] = is_array(($param[keywords])) ? implode(", ", ($param[keywords])) : ($param[keywords]);
    $param['tagline'] = trim(strip_tags($this->match('/Tagline.?:(.*?)(match('/href="ratings".*?>([0-9]+,?[0-9]*) votes\)/ms', $content, 1);
    $param[votes] = is_array(($param[votes])) ? implode(", ", ($param[votes])) : ($param[votes]);
    $param['languages'] = $this->match_all('/a href="/language/.*?">(.*?)/ms', $content, 1);
    $param['languages'] = array_unique($param['languages']);
    $param[languages] = is_array(($param[languages])) ? implode(", ", ($param[languages])) : ($param[languages]);
    $param['countries'] = array();
    foreach ($this->match_all('/(.*?)/ms', $this->match('/Country.?:(.*?)(|See more)/ms', $content, 1), 1) as $m) {
        array_push($param['countries'], $m);
    } $param[countries] = is_array(($param[countries])) ? implode(", ", ($param[countries])) : ($param[countries]);
    $param['companies'] = $this->match_all('/a.*?href="/company/.*?">(.*?)/ms', $content, 1);
    $param['companies'] = array_unique($param['companies']);
    $param[companies] = is_array(($param[companies])) ? implode(", ", ($param[companies])) : ($param[companies]);
    return $param;
}

/**
 * Build a list of "country = date" strings from a release-info page.
 *
 * @param string $content HTML of the release-info page.
 * @return array One "country = date" string per row that was matched.
 */
function getReleaseDates($content) {
    // NOTE(review): the row and cell patterns are identical in this listing
    // and look truncated; confirm them against the original source.
    $section = $this->match('/Date(.*?)/ms', $content, 1);
    $rows = $this->match_all('/(.*?)/ms', $section, 1);

    $lines = array();
    foreach ($rows as $row) {
        $where = trim(strip_tags($this->match('/(.*?)/ms', $row, 1)));
        $when = trim(strip_tags($this->match('/(.*?)/ms', $row, 1)));
        $lines[] = $where . " = " . $when;
    }

    return $lines;
}

/**
 * Build a list of "title = country" strings from a release-info page.
 *
 * @param string $content   HTML of the release-info page.
 * @param mixed  $usa_title Output parameter: set to the alternative title of
 *                          the last row whose country text contains "usa"
 *                          (case-insensitive); left untouched otherwise.
 * @return array One "title = country" string per row that was matched.
 */
function getAkaTitles($content, &$usa_title) {
    $akaTitles = array();

    // NOTE(review): the patterns below look truncated in this listing;
    // confirm them against the original source.
    $section = $this->match('/Also Known As(.*?)/ms', $content, 1);
    $rows = $this->match_all('/(.*?)/msi', $section, 1);
    if (!is_array($rows)) {
        // match_all() reports a regex failure as false; nothing to iterate.
        return $akaTitles;
    }

    foreach ($rows as $row) {
        $cells = $this->match_all('/(.*?)/ms', $row, 1);
        // A row may yield fewer than two cells; default to empty strings
        // instead of reading undefined offsets.
        $akaTitle = isset($cells[0]) ? trim($cells[0]) : '';
        $akaCountry = isset($cells[1]) ? trim($cells[1]) : '';

        $akaTitles[] = $akaTitle . " = " . $akaCountry;

        if ($akaCountry != '' && stripos($akaCountry, "usa") !== false) {
            $usa_title = $akaTitle;
        }
    }

    return $akaTitles;
}

}

Alexey · Accepted Answer

There are a few errors behind this:

In the IMDBGrabber::scruburl($input) method the regexp is wrong: there may be characters after the double quote and before the http. If I were you, I'd rather use the Google Custom Search API for the search; with the current approach you'll be banned after a few hundred to a few thousand requests. The fixed regexp would be:

$urls = $this->match_all('/




The stripos($content, …) condition you're trying to test seems to be
wrong. I downloaded the HTML for title/tt0371746/ and there was no
such string. I'd use something like if (stripos($content, "Your rating:") !== false) {



After those two changes your script outputs something like this:

array(34) {
  ["title_id"]=>
  string(9) "tt0371746"
  ["title"]=>
  string(8) "Iron Man"
  ["type"]=>
  string(11) "video.movie"
  ["year"]=>
  string(4) "2008"
  ["rating"]=>
  string(3) "7.9"
  ["ratingcount"]=>
  string(7) "578,477"
  ["reviewcount"]=>
  string(10) "1,017 user"
  ["trailer"]=>
  string(24) "/video/imdb/vi447873305/"
  ["genres"]=>
  string(28) " Action,  Adventure,  Sci-Fi"
  ["directors"]=>
  string(57) "Jon Favreau"
  ["writers"]=>
  string(131) "Mark Fergus, Hawk Ostby, 6 more credits"
  ["stars"]=>
  string(214) "Robert Downey Jr., Gwyneth Paltrow, Terrence Howard,  See full cast and crew"
  ["cast"]=>
  string(0) ""
  ["mpaa_rating"]=>
  bool(false)
  ["also_known_as"]=>
  string(0) ""
  ["usa_title"]=>
  NULL
  ["release_date"]=>
  string(24) "1 May 2008 (Netherlands)"
  ["release_dates"]=>
  string(0) ""
  ["plot"]=>
  string(0) ""
  ["poster"]=>
  string(0) ""
  ["poster_large"]=>
  string(0) ""
  ["poster_small"]=>
  string(0) ""
  ["runtime"]=>
  string(3) "126"
  ["oscars"]=>
  string(0) ""
  ["awards"]=>
  string(2) "18"
  ["nominations"]=>
  string(2) "51"
  ["storyline"]=>
  string(856) "Tony Stark. Genius, billionaire, playboy, philanthropist. Son of legendary inventor and weapons contractor Howard Stark. When Tony Stark is assigned to give a weapons presentation to an Iraqi unit led by Lt. Col. James Rhodes, he's given a ride on enemy lines. That ride ends badly when Stark's Humvee that he's riding in is attacked by enemy combatants. He survives - barely - with a chest full of shrapnel and a car battery attached to his heart. In order to survive he comes up with a way to miniaturize the battery and figures out that the battery can power something else. Thus Iron Man is born. He uses the primitive device to escape from the cave in Iraq. Once back home, he then begins work on perfecting the Iron Man suit. But the man who was put in charge of Stark Industries has plans of his own to take over Tony's technology for other matters."
  ["keywords"]=>
  string(304) " armor,  cave,  iron,  genius,  missile, See All (198)"
  ["tagline"]=>
  string(52) "Get ready for a different breed of heavy metal hero."
  ["votes"]=>
  bool(false)
  ["languages"]=>
  string(153) "|
        Persian, |
        Arabic"
  ["countries"]=>
  string(3) "USA"
  ["companies"]=>
  string(75) "Paramount Pictures, Marvel Enterprises, Marvel Studios"
  ["imdb_url"]=>
  string(36) "http://www.imdb.com/title/tt0371746/"
}


with a bunch of PHP notices. 

But all of this is a nightmare to maintain, and I have to say the code is not the best. Consider using this approach or a parser based on XML/XPath addressing. See this as well.

Extracting Information from website IMDB always generates an error

Answers (1)

Related Questions