Reputation: 139862
Here is an excerpt function:
function excerpt($text, $phrase, $radius = 100, $ending = "...") {
270 if (empty($text) or empty($phrase)) {
271 return $this->truncate($text, $radius * 2, $ending);
272 }
273
274 $phraseLen = strlen($phrase);
275 if ($radius < $phraseLen) {
276 $radius = $phraseLen;
277 }
278
279 $pos = strpos(strtolower($text), strtolower($phrase));
280
281 $startPos = 0;
282 if ($pos > $radius) {
283 $startPos = $pos - $radius;
284 }
285
286 $textLen = strlen($text);
287
288 $endPos = $pos + $phraseLen + $radius;
289 if ($endPos >= $textLen) {
290 $endPos = $textLen;
291 }
292
293 $excerpt = substr($text, $startPos, $endPos - $startPos);
294 if ($startPos != 0) {
295 $excerpt = substr_replace($excerpt, $ending, 0, $phraseLen);
296 }
297
298 if ($endPos != $textLen) {
299 $excerpt = substr_replace($excerpt, $ending, -$phraseLen);
300 }
301
302 return $excerpt;
303 }
Its drawback is that it doesn't try to match as many searched words as possible,which only matches once by default.
How to implement the desired one?
Upvotes: 2
Views: 2779
Reputation: 174
I could not contact erisco, so I am posting his function with multiple fixes (most importantly multibyte support).
/**
* @param string $text text to be searched
* @param string $phrase search string
* @param int $span approximate length of the excerpt
* @param string $delimiter string to use as a suffix and/or prefix if the excerpt is from the middle of a text
*
* @return string
*/
public static function excerpt($text, $phrase, $span = 100, $delimiter = '...')
{
$phrases = preg_split('/\s+/u', $phrase);
$regexp = '/\b(?:';
foreach($phrases as $phrase)
{
$regexp.= preg_quote($phrase, '/') . '|';
}
$regexp = mb_substr($regexp, 0, -1) .')\b/ui';
$matches = [];
preg_match_all($regexp, $text, $matches, PREG_OFFSET_CAPTURE);
$matches = $matches[0];
$nodes = [];
foreach($matches as $match)
{
$node = new stdClass;
$node->phraseLength = mb_strlen($match[0]);
$node->position = mb_strlen(substr($text, 0, $match[1])); // calculate UTF-8 position (@see https://bugs.php.net/bug.php?id=67487)
$nodes[] = $node;
}
if(count($nodes) > 0)
{
$clust = new stdClass;
$clust->nodes[] = array_shift($nodes);
$clust->length = $clust->nodes[0]->phraseLength;
$clust->i = 0;
$clusters = new stdClass;
$clusters->data =
[
$clust
];
$clusters->i = 0;
foreach($nodes as $node)
{
$lastClust = $clusters->data[$clusters->i];
$lastNode = $lastClust->nodes[$lastClust->i];
$addedLength = $node->position - $lastNode->position - $lastNode->phraseLength + $node->phraseLength;
if($lastClust->length + $addedLength <= $span)
{
$lastClust->nodes[] = $node;
$lastClust->length+= $addedLength;
$lastClust->i++;
}
else
{
if($addedLength > $span)
{
$newClust = new stdClass;
$newClust->nodes =
[
$node
];
$newClust->i = 0;
$newClust->length = $node->phraseLength;
$clusters->data[] = $newClust;
$clusters->i++;
}
else
{
$newClust = clone $lastClust;
while($newClust->length + $addedLength > $span)
{
$shiftedNode = array_shift($newClust->nodes);
if($shiftedNode === null)
{
break;
}
$newClust->i--;
$removedLength = $shiftedNode->phraseLength;
if(isset($newClust->nodes[0]))
{
$removedLength+= $newClust->nodes[0]->position - $shiftedNode->position;
}
$newClust->length-= $removedLength;
}
if($newClust->i < 0)
{
$newClust->i = 0;
}
$newClust->nodes[] = $node;
$newClust->length+= $addedLength;
$clusters->data[] = $newClust;
$clusters->i++;
}
}
}
$bestClust = $clusters->data[0];
$bestClustSize = count($bestClust->nodes);
foreach($clusters->data as $clust)
{
$newClustSize = count($clust->nodes);
if($newClustSize > $bestClustSize)
{
$bestClust = $clust;
$bestClustSize = $newClustSize;
}
}
$clustLeft = $bestClust->nodes[0]->position;
$clustLen = $bestClust->length;
$padding = intval(round(($span - $clustLen) / 2));
$clustLeft-= $padding;
if($clustLeft < 0)
{
$clustLen+= $clustLeft * -1 + $padding;
$clustLeft = 0;
}
else
{
$clustLen+= $padding * 2;
}
}
else
{
$clustLeft = 0;
$clustLen = $span;
}
$textLen = mb_strlen($text);
$prefix = '';
$suffix = '';
if($clustLeft > 0 && !ctype_space(mb_substr($text, $clustLeft, 1))
&& !ctype_space(mb_substr($text, $clustLeft - 1, 1)))
{
$clustLeft++;
while(!ctype_space(mb_substr($text, $clustLeft, 1)))
{
$clustLeft++;
}
$prefix = $delimiter;
}
$lastChar = $clustLeft + $clustLen;
if($lastChar < $textLen && !ctype_space(mb_substr($text, $lastChar, 1))
&& !ctype_space(mb_substr($text, $lastChar + 1, 1)))
{
$lastChar--;
while(!ctype_space(mb_substr($text, $lastChar, 1)))
{
$lastChar--;
}
$suffix = $delimiter;
$clustLen = $lastChar - $clustLeft;
}
if($clustLeft > 0)
{
$prefix = $delimiter;
}
if($clustLeft + $clustLen < $textLen)
{
$suffix = $delimiter;
}
return $prefix . trim(mb_substr($text, $clustLeft, $clustLen + 1)) . $suffix;
}
Upvotes: 0
Reputation: 897
I came up with the below to generate excerpts. You can see the code here https://github.com/boyter/php-excerpt It works by finding all the locations of the matching words, then takes an excerpt based on which words are the closest. In theory this does not sound very good but in practice it works very well.
Its actually very close to how Sphider (for the record it lives in searchfuncs.php from line 529 to 566) generates its snippets. I think the below is much easier to read and is without bugs which exist in Sphider. It also does not use regular expressions which makes it a bit faster then other methods I have used.
I blogged about it here http://www.boyter.org/2013/04/building-a-search-result-extract-generator-in-php/
<?php
// find the locations of each of the words
// Nothing exciting here. The array_unique is required
// unless you decide to make the words unique before passing in
function _extractLocations($words, $fulltext) {
$locations = array();
foreach($words as $word) {
$wordlen = strlen($word);
$loc = stripos($fulltext, $word);
while($loc !== FALSE) {
$locations[] = $loc;
$loc = stripos($fulltext, $word, $loc + $wordlen);
}
}
$locations = array_unique($locations);
sort($locations);
return $locations;
}
// Work out which is the most relevant portion to display
// This is done by looping over each match and finding the smallest distance between two found
// strings. The idea being that the closer the terms are the better match the snippet would be.
// When checking for matches we only change the location if there is a better match.
// The only exception is where we have only two matches in which case we just take the
// first as will be equally distant.
function _determineSnipLocation($locations, $prevcount) {
// If we only have 1 match we dont actually do the for loop so set to the first
$startpos = $locations[0];
$loccount = count($locations);
$smallestdiff = PHP_INT_MAX;
// If we only have 2 skip as its probably equally relevant
if(count($locations) > 2) {
// skip the first as we check 1 behind
for($i=1; $i < $loccount; $i++) {
if($i == $loccount-1) { // at the end
$diff = $locations[$i] - $locations[$i-1];
}
else {
$diff = $locations[$i+1] - $locations[$i];
}
if($smallestdiff > $diff) {
$smallestdiff = $diff;
$startpos = $locations[$i];
}
}
}
$startpos = $startpos > $prevcount ? $startpos - $prevcount : 0;
return $startpos;
}
// 1/6 ratio on prevcount tends to work pretty well and puts the terms
// in the middle of the extract
function extractRelevant($words, $fulltext, $rellength=300, $prevcount=50, $indicator='...') {
$textlength = strlen($fulltext);
if($textlength <= $rellength) {
return $fulltext;
}
$locations = _extractLocations($words, $fulltext);
$startpos = _determineSnipLocation($locations,$prevcount);
// if we are going to snip too much...
if($textlength-$startpos < $rellength) {
$startpos = $startpos - ($textlength-$startpos)/2;
}
$reltext = substr($fulltext, $startpos, $rellength);
// check to ensure we dont snip the last word if thats the match
if( $startpos + $rellength < $textlength) {
$reltext = substr($reltext, 0, strrpos($reltext, " ")).$indicator; // remove last word
}
// If we trimmed from the front add ...
if($startpos != 0) {
$reltext = $indicator.substr($reltext, strpos($reltext, " ") + 1); // remove first word
}
return $reltext;
}
?>
Upvotes: 6
Reputation: 14329
The code listed here thus far has not worked for me so I spent some time thinking of an algorithm to implement. What I have now works decently, and it does not appear to be a performance problem - feel free to test. Results are not as snazzy Google's snippets as there is no detection for where sentences start and end. I could add this but it'd be that much more complicated and I'd have to throw in the towel on doing this in a single function. Already its getting crowded and could be better coded if, for example, the object manipulations were abstracted to methods.
Anyhow, this is what I have and it should be a good start. The most dense excerpt is determined and the resulting string will approximately be the span you have specified. I urge some testing of this code as I have not done a thorough job of it. Surely there are problematic cases to be found.
I also encourage anyone to improve on this algorithm, or simply the code to execute it.
Enjoy.
// string excerpt(string $text, string $phrase, int $span = 100, string $delimiter = '...')
// parameters:
// $text - text to be searched
// $phrase - search string
// $span - approximate length of the excerpt
// $delimiter - string to use as a suffix and/or prefix if the excerpt is from the middle of a text
function excerpt($text, $phrase, $span = 100, $delimiter = '...') {
$phrases = preg_split('/\s+/', $phrase);
$regexp = '/\b(?:';
foreach ($phrases as $phrase) {
$regexp .= preg_quote($phrase, '/') . '|';
}
$regexp = substr($regexp, 0, -1) . ')\b/i';
$matches = array();
preg_match_all($regexp, $text, $matches, PREG_OFFSET_CAPTURE);
$matches = $matches[0];
$nodes = array();
foreach ($matches as $match) {
$node = new stdClass;
$node->phraseLength = strlen($match[0]);
$node->position = $match[1];
$nodes[] = $node;
}
if (count($nodes) > 0) {
$clust = new stdClass;
$clust->nodes[] = array_shift($nodes);
$clust->length = $clust->nodes[0]->phraseLength;
$clust->i = 0;
$clusters = new stdClass;
$clusters->data = array($clust);
$clusters->i = 0;
foreach ($nodes as $node) {
$lastClust = $clusters->data[$clusters->i];
$lastNode = $lastClust->nodes[$lastClust->i];
$addedLength = $node->position - $lastNode->position - $lastNode->phraseLength + $node->phraseLength;
if ($lastClust->length + $addedLength <= $span) {
$lastClust->nodes[] = $node;
$lastClust->length += $addedLength;
$lastClust->i += 1;
} else {
if ($addedLength > $span) {
$newClust = new stdClass;
$newClust->nodes = array($node);
$newClust->i = 0;
$newClust->length = $node->phraseLength;
$clusters->data[] = $newClust;
$clusters->i += 1;
} else {
$newClust = clone $lastClust;
while ($newClust->length + $addedLength > $span) {
$shiftedNode = array_shift($newClust->nodes);
if ($shiftedNode === null) {
break;
}
$newClust->i -= 1;
$removedLength = $shiftedNode->phraseLength;
if (isset($newClust->nodes[0])) {
$removedLength += $newClust->nodes[0]->position - $shiftedNode->position;
}
$newClust->length -= $removedLength;
}
if ($newClust->i < 0) {
$newClust->i = 0;
}
$newClust->nodes[] = $node;
$newClust->length += $addedLength;
$clusters->data[] = $newClust;
$clusters->i += 1;
}
}
}
$bestClust = $clusters->data[0];
$bestClustSize = count($bestClust->nodes);
foreach ($clusters->data as $clust) {
$newClustSize = count($clust->nodes);
if ($newClustSize > $bestClustSize) {
$bestClust = $clust;
$bestClustSize = $newClustSize;
}
}
$clustLeft = $bestClust->nodes[0]->position;
$clustLen = $bestClust->length;
$padding = round(($span - $clustLen)/2);
$clustLeft -= $padding;
if ($clustLeft < 0) {
$clustLen += $clustLeft*-1 + $padding;
$clustLeft = 0;
} else {
$clustLen += $padding*2;
}
} else {
$clustLeft = 0;
$clustLen = $span;
}
$textLen = strlen($text);
$prefix = '';
$suffix = '';
if (!ctype_space($text[$clustLeft]) && isset($text[$clustLeft-1]) && !ctype_space($text[$clustLeft-1])) {
while (!ctype_space($text[$clustLeft])) {
$clustLeft += 1;
}
$prefix = $delimiter;
}
$lastChar = $clustLeft + $clustLen;
if (!ctype_space($text[$lastChar]) && isset($text[$lastChar+1]) && !ctype_space($text[$lastChar+1])) {
while (!ctype_space($text[$lastChar])) {
$lastChar -= 1;
}
$suffix = $delimiter;
$clustLen = $lastChar - $clustLeft;
}
if ($clustLeft > 0) {
$prefix = $delimiter;
}
if ($clustLeft + $clustLen < $textLen) {
$suffix = $delimiter;
}
return $prefix . trim(substr($text, $clustLeft, $clustLen+1)) . $suffix;
}
Upvotes: 6
Reputation: 1
function excerpt($text, $phrase, $radius = 100, $ending = "...") {
$phraseLen = strlen($phrase);
if ($radius < $phraseLen) {
$radius = $phraseLen;
}
$phrases = explode (' ',$phrase);
foreach ($phrases as $phrase) {
$pos = strpos(strtolower($text), strtolower($phrase));
if ($pos > -1) break;
}
$startPos = 0;
if ($pos > $radius) {
$startPos = $pos - $radius;
}
$textLen = strlen($text);
$endPos = $pos + $phraseLen + $radius;
if ($endPos >= $textLen) {
$endPos = $textLen;
}
$excerpt = substr($text, $startPos, $endPos - $startPos);
if ($startPos != 0) {
$excerpt = substr_replace($excerpt, $ending, 0, $phraseLen);
}
if ($endPos != $textLen) {
$excerpt = substr_replace($excerpt, $ending, -$phraseLen);
}
return $excerpt; }
Upvotes: 0