topical media & game development
professional-sql-04-indexer.php / php
#! /usr/bin/php
<?php
/*
 * Command-line search indexer.
 *
 * Empties the index tables, loads the stop-word list, then downloads every
 * document queued for crawling and stores its title, its description and the
 * position of each word that is not a stop word.
 *
 * NOTE(review): this listing was rebuilt from a copy in which the PHP
 * variables and several whole statements had been lost during text
 * extraction.  Identifiers that no longer appeared in the damaged copy --
 * the SEARCH_INDEX and SEARCH_CRAWL table names, the SEARCH_INDEX column
 * list, and the default description text -- are reconstructions; confirm
 * them against the real schema before running.
 *
 * DB_TBL_PREFIX and $GLOBALS['DB'] are presumably supplied by the include
 * files below (TODO confirm).  The legacy mysql_* API is kept on purpose so
 * the script stays compatible with that connection.
 */

// include shared code
include 'lib/common.php';
include 'lib/db.php';

// clear index tables
foreach (array('SEARCH_INDEX', 'SEARCH_TERM', 'SEARCH_DOCUMENT') as $table)
{
    $query = sprintf('TRUNCATE TABLE %s%s', DB_TBL_PREFIX, $table);
    mysql_query($query, $GLOBALS['DB']);
}

// retrieve the list of stop words
$query = sprintf('SELECT TERM_VALUE FROM %sSEARCH_STOP_WORD', DB_TBL_PREFIX);
$result = mysql_query($query, $GLOBALS['DB']);
if ($result === false)
{
    // without a result set the fetch loop below would only emit warnings
    fwrite(STDERR, 'Unable to load stop words: ' .
        mysql_error($GLOBALS['DB']) . "\n");
    exit(1);
}
$stop_words = array();
while ($row = mysql_fetch_assoc($result))
{
    // since this list will be checked for each word, use term as the array
    // key-- isset($stop_words[<term>]) is a hash lookup, unlike the linear
    // scan done by in_array(<term>, $stop_words)
    $stop_words[$row['TERM_VALUE']] = true;
}
mysql_free_result($result);

// open a curl handle that is reused for every download
$ch = curl_init();

// set curl options
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_HEADER, false);
// curl_exec() must hand the body back as a string instead of printing it,
// because the body is passed on to tidy and strip_tags below
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_USERAGENT, 'Search Engine Indexer');

// fetch list of documents to index
$query = sprintf('SELECT DOCUMENT_URL FROM %sSEARCH_CRAWL', DB_TBL_PREFIX);
$result = mysql_query($query, $GLOBALS['DB']);
if ($result === false)
{
    fwrite(STDERR, 'Unable to load the list of documents: ' .
        mysql_error($GLOBALS['DB']) . "\n");
    curl_close($ch);
    exit(1);
}
while ($row = mysql_fetch_assoc($result))
{
    echo 'Processing: ' . $row['DOCUMENT_URL'] . "\n";

    // retrieve the document's content
    curl_setopt($ch, CURLOPT_URL, $row['DOCUMENT_URL']);
    $file = curl_exec($ch);
    if (!is_string($file))
    {
        // download failed; there is nothing to index for this document
        fwrite(STDERR, 'Download failed: ' . curl_error($ch) . "\n");
        continue;
    }

    // clean up the markup so it has a chance of parsing as XML; keep the
    // raw download if tidy cannot repair it
    $repaired = tidy_repair_string($file);
    if (is_string($repaired))
    {
        $file = $repaired;
    }

    // parse warnings are suppressed: a document that is still not
    // well-formed simply gets the fallback title and description, and its
    // text is indexed anyway
    $html = @simplexml_load_string($file);

    // extract the title
    $title = '';
    if ($html !== false && isset($html->head->title))
    {
        $title = trim((string) $html->head->title);
    }
    if ($title === '')
    {
        // use the filename if a title is not found
        $title = basename($row['DOCUMENT_URL']);
    }

    // extract the description
    $description = 'No description provided.';
    if ($html !== false && isset($html->head->meta))
    {
        foreach ($html->head->meta as $meta)
        {
            if (isset($meta['name']) &&
                (string) $meta['name'] === 'description')
            {
                $description = (string) $meta['content'];
                break;
            }
        }
    }

    // add the document to the index
    $query = sprintf('INSERT INTO %sSEARCH_DOCUMENT (DOCUMENT_URL, ' .
        'DOCUMENT_TITLE, DESCRIPTION) VALUES ("%s", "%s", "%s")',
        DB_TBL_PREFIX,
        mysql_real_escape_string($row['DOCUMENT_URL'], $GLOBALS['DB']),
        mysql_real_escape_string($title, $GLOBALS['DB']),
        mysql_real_escape_string($description, $GLOBALS['DB']));
    mysql_query($query, $GLOBALS['DB']);

    // retrieve the document's id
    $doc_id = mysql_insert_id($GLOBALS['DB']);

    // strip HTML tags out from the content
    $file = strip_tags($file);

    // break content into individual words; format 2 keys each word by its
    // character position in the text, which is stored as the word's offset
    foreach (str_word_count($file, 2) as $index => $word)
    {
        // words are stored in lowercase so comparisons are consistent
        $word = strtolower($word);

        // skip the word if it appears in the stop words list
        if (isset($stop_words[$word]))
        {
            continue;
        }

        // determine if the word already exists in the database
        $query = sprintf('SELECT TERM_ID FROM %sSEARCH_TERM WHERE ' .
            'TERM_VALUE = "%s"',
            DB_TBL_PREFIX,
            mysql_real_escape_string($word, $GLOBALS['DB']));
        $result2 = mysql_query($query, $GLOBALS['DB']);
        if ($result2 !== false && mysql_num_rows($result2))
        {
            // word exists so retrieve its id
            list($word_id) = mysql_fetch_row($result2);
        }
        else
        {
            // add word to the database
            $query = sprintf('INSERT INTO %sSEARCH_TERM (TERM_VALUE) ' .
                'VALUES ("%s")',
                DB_TBL_PREFIX,
                mysql_real_escape_string($word, $GLOBALS['DB']));
            mysql_query($query, $GLOBALS['DB']);

            // determine the new word's id
            $word_id = mysql_insert_id($GLOBALS['DB']);
        }
        if ($result2 !== false)
        {
            mysql_free_result($result2);
        }

        // add the index record
        $query = sprintf('INSERT INTO %sSEARCH_INDEX (DOCUMENT_ID, ' .
            'TERM_ID, `OFFSET`) VALUES (%d, %d, %d)',
            DB_TBL_PREFIX,
            $doc_id,
            $word_id,
            $index);
        mysql_query($query, $GLOBALS['DB']);
    }
}
mysql_free_result($result);

// release the download handle
curl_close($ch);

echo 'Indexing complete.' . "\n";
?>
(C) Æliens
20/2/2008
You may not copy or print any of this material without explicit permission of the author or the publisher.
In case of other copyright issues, contact the author.