search_index
modules/search/search.module, строка 451
- Версии
- 5 – 6
search_index($sid, $type, $text)
Update the full-text search index for a particular item.
Параметры
$sid
A number identifying this particular item (e.g. node id).
$type
A string defining this type of item (e.g. 'node'
)
$text
The content of this item. Must be a piece of HTML text.
Связанные темы
Код
<?php
function search_index($sid, $type, $text) {
$minimum_word_size = variable_get('minimum_word_size', 3);
// Link matching
global $base_url;
$node_regexp = '@href=[\'"]?(?:'. preg_quote($base_url, '@') .'/|'. preg_quote(base_path(), '@') .')(?:\?q=)?/?((?![a-z]+:)[^\'">]+)[\'">]@i';
// Multipliers for scores of words inside certain HTML tags.
// Note: 'a' must be included for link ranking to work.
$tags = array('h1' => 25,
'h2' => 18,
'h3' => 15,
'h4' => 12,
'h5' => 9,
'h6' => 6,
'u' => 3,
'b' => 3,
'i' => 3,
'strong' => 3,
'em' => 3,
'a' => 10);
// Strip off all ignored tags to speed up processing, but insert space before/after
// them to keep word boundaries.
$text = str_replace(array('<', '>'), array(' <', '> '), $text);
$text = strip_tags($text, '<'. implode('><', array_keys($tags)) .'>');
// Split HTML tags from plain text.
$split = preg_split('/\s*<([^>]+?)>\s*/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
// Note: PHP ensures the array consists of alternating delimiters and literals
// and begins and ends with a literal (inserting $null as required).
$tag = FALSE; // Odd/even counter. Tag or no tag.
$link = FALSE; // State variable for link analyser
$score = 1; // Starting score per word
$accum = ' '; // Accumulator for cleaned up data
$tagstack = array(); // Stack with open tags
$tagwords = 0; // Counter for consecutive words
$focus = 1; // Focus state
$results = array(0 => array()); // Accumulator for words for index
foreach ($split as $value) {
if ($tag) {
// Increase or decrease score per word based on tag
list($tagname) = explode(' ', $value, 2);
$tagname = drupal_strtolower($tagname);
// Closing or opening tag?
if ($tagname[0] == '/') {
$tagname = substr($tagname, 1);
// If we encounter unexpected tags, reset score to avoid incorrect boosting.
if (!count($tagstack) || $tagstack[0] != $tagname) {
$tagstack = array();
$score = 1;
}
else {
// Remove from tag stack and decrement score
$score = max(1, $score - $tags[array_shift($tagstack)]);
}
if ($tagname == 'a') {
$link = FALSE;
}
}
else {
if ($tagstack[0] == $tagname) {
// None of the tags we look for make sense when nested identically.
// If they are, it's probably broken HTML.
$tagstack = array();
$score = 1;
}
else {
// Add to open tag stack and increment score
array_unshift($tagstack, $tagname);
$score += $tags[$tagname];
}
if ($tagname == 'a') {
// Check if link points to a node on this site
if (preg_match($node_regexp, $value, $match)) {
$path = drupal_get_normal_path($match[1]);
if (preg_match('!(?:node|book)/(?:view/)?([0-9]+)!i', $path, $match)) {
$linknid = $match[1];
if ($linknid > 0) {
// Note: ignore links to uncachable nodes to avoid redirect bugs.
$node = db_fetch_object(db_query('SELECT n.title, n.nid, n.vid, r.format FROM {node} n INNER JOIN {node_revisions} r ON n.vid = r.vid WHERE n.nid = %d', $linknid));
if (filter_format_allowcache($node->format)) {
$link = TRUE;
$linktitle = $node->title;
}
}
}
}
}
}
// A tag change occurred, reset counter.
$tagwords = 0;
}
else {
// Note: use of PREG_SPLIT_DELIM_CAPTURE above will introduce empty values
if ($value != '') {
if ($link) {
// Check to see if the node link text is its URL. If so, we use the target node title instead.
if (preg_match('!^https?://!i', $value)) {
$value = $linktitle;
}
}
$words = search_index_split($value);
foreach ($words as $word) {
// Add word to accumulator
$accum .= $word .' ';
$num = is_numeric($word);
// Check wordlength
if ($num || drupal_strlen($word) >= $minimum_word_size) {
// Normalize numbers
if ($num) {
$word = (int)ltrim($word, '-0');
}
if ($link) {
if (!isset($results[$linknid])) {
$results[$linknid] = array();
}
$results[$linknid][$word] += $score * $focus;
}
else {
$results[0][$word] += $score * $focus;
// Focus is a decaying value in terms of the amount of unique words up to this point.
// From 100 words and more, it decays, to e.g. 0.5 at 500 words and 0.3 at 1000 words.
$focus = min(1, .01 + 3.5 / (2 + count($results[0]) * .015));
}
}
$tagwords++;
// Too many words inside a single tag probably mean a tag was accidentally left open.
if (count($tagstack) && $tagwords >= 15) {
$tagstack = array();
$score = 1;
}
}
}
}
$tag = !$tag;
}
search_wipe($sid, $type, TRUE);
// Insert cleaned up data into dataset
db_query("INSERT INTO {search_dataset} (sid, type, data) VALUES (%d, '%s', '%s')", $sid, $type, $accum);
// Insert results into search index
foreach ($results[0] as $word => $score) {
db_query("INSERT INTO {search_index} (word, sid, type, score) VALUES ('%s', %d, '%s', %f)", $word, $sid, $type, $score);
search_dirty($word);
}
unset($results[0]);
// Now insert links to nodes
foreach ($results as $nid => $words) {
foreach ($words as $word => $score) {
db_query("INSERT INTO {search_index} (word, sid, type, fromsid, fromtype, score) VALUES ('%s', %d, '%s', %d, '%s', %f)", $word, $nid, 'node', $sid, $type, $score);
search_dirty($word);
}
}
}
?>
Войдите или зарегистрируйтесь, чтобы получить возможность отправлять комментарии