professional-sql-04-indexer.php / php
#! /usr/bin/php
<?php
/*
 * Command-line search indexer.
 *
 * Empties the search index tables, loads the stop-word list, then downloads
 * every URL listed in the SEARCH_CRAWL table and records, for each word that
 * is not a stop word, its offset within the tag-stripped document text.
 *
 * Relies on lib/common.php and lib/db.php to provide DB_TBL_PREFIX and the
 * database link in $GLOBALS['DB'].
 */

// include shared code
include 'lib/common.php';
include 'lib/db.php';

// clear index tables
$query = sprintf('TRUNCATE TABLE %sSEARCH_INDEX', DB_TBL_PREFIX);
mysql_query($query, $GLOBALS['DB']);

$query = sprintf('TRUNCATE TABLE %sSEARCH_TERM', DB_TBL_PREFIX);
mysql_query($query, $GLOBALS['DB']);

$query = sprintf('TRUNCATE TABLE %sSEARCH_DOCUMENT', DB_TBL_PREFIX);
mysql_query($query, $GLOBALS['DB']);

// retrieve the list of stop words
$query = sprintf('SELECT TERM_VALUE FROM %sSEARCH_STOP_WORD', DB_TBL_PREFIX);
$result = mysql_query($query, $GLOBALS['DB']);
$stop_words = array();
while ($row = mysql_fetch_array($result)) {
    // since this list will be checked for each word, use term as the array
    // key-- isset($stop_words[<term>]) is more efficient than using
    // in_array(<term>, $stop_words)
    $stop_words[$row['TERM_VALUE']] = true;
}
mysql_free_result($result);

// open CURL handle for downloading
$ch = curl_init();

// set curl options
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_HEADER, false);
// without this curl_exec() prints the body and returns true instead of
// returning the document as a string
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_USERAGENT, 'Search Engine Indexer');

// fetch list of documents to index
$query = sprintf('SELECT DOCUMENT_URL FROM %sSEARCH_CRAWL', DB_TBL_PREFIX);
$result = mysql_query($query, $GLOBALS['DB']);
while ($row = mysql_fetch_array($result)) {
    echo 'Processing: ' . $row['DOCUMENT_URL'] . "...\n";

    // retrieve the document's content
    curl_setopt($ch, CURLOPT_URL, $row['DOCUMENT_URL']);
    $file = curl_exec($ch);
    if ($file === false) {
        // nothing was downloaded, so there is nothing to index
        echo 'Skipped (download failed): ' . curl_error($ch) . "\n";
        continue;
    }

    // clean up the markup before handing it to the XML parser
    $file = tidy_repair_string($file);
    $html = simplexml_load_string($file);
    // or: $html = @simplexml_load_string($file);

    // defaults used when the document cannot be parsed or lacks the element:
    // the filename stands in for the title
    $title = basename($row['DOCUMENT_URL']);
    $description = 'No description provided.';

    // simplexml_load_string() returns false on malformed input; in that case
    // keep the defaults and still index the document's text
    if ($html !== false) {
        // extract the title
        if ($html->head->title) {
            $title = (string) $html->head->title;
        }

        // extract the description from <meta name="description" content="...">
        foreach ($html->head->meta as $meta) {
            if (isset($meta['name']) && (string) $meta['name'] === 'description') {
                $description = (string) $meta['content'];
                break;
            }
        }
    }

    // add the document to the index
    $query = sprintf('INSERT INTO %sSEARCH_DOCUMENT (DOCUMENT_URL, ' .
        'DOCUMENT_TITLE, DESCRIPTION) VALUES ("%s", "%s", "%s")',
        DB_TBL_PREFIX,
        mysql_real_escape_string($row['DOCUMENT_URL'], $GLOBALS['DB']),
        mysql_real_escape_string($title, $GLOBALS['DB']),
        mysql_real_escape_string($description, $GLOBALS['DB']));
    mysql_query($query, $GLOBALS['DB']);

    // retrieve the document's id
    $doc_id = mysql_insert_id($GLOBALS['DB']);

    // strip HTML tags out from the content
    $file = strip_tags($file);

    // break content into individual words; format 2 keys each word by its
    // character position within the string, which becomes the stored offset
    foreach (str_word_count($file, 2) as $index => $word) {
        // words should be stored as lowercase for comparisons
        $word = strtolower($word);

        // skip word if it appears in the stop words list
        if (isset($stop_words[$word])) {
            continue;
        }

        // determine if the word already exists in the database
        $query = sprintf('SELECT TERM_ID FROM %sSEARCH_TERM WHERE ' .
            'TERM_VALUE = "%s"',
            DB_TBL_PREFIX,
            mysql_real_escape_string($word, $GLOBALS['DB']));
        $result2 = mysql_query($query, $GLOBALS['DB']);
        if (mysql_num_rows($result2)) {
            // word exists so retrieve its id
            list($word_id) = mysql_fetch_row($result2);
        } else {
            // add word to the database
            $query = sprintf('INSERT INTO %sSEARCH_TERM (TERM_VALUE) ' .
                'VALUE ("%s")',
                DB_TBL_PREFIX,
                mysql_real_escape_string($word, $GLOBALS['DB']));
            mysql_query($query, $GLOBALS['DB']);

            // determine the word's id
            $word_id = mysql_insert_id($GLOBALS['DB']);
        }
        mysql_free_result($result2);

        // add the index record
        $query = sprintf('INSERT INTO %sSEARCH_INDEX (DOCUMENT_ID, ' .
            'TERM_ID, OFFSET) VALUE (%d, %d, %d)',
            DB_TBL_PREFIX,
            $doc_id,
            $word_id,
            $index);
        mysql_query($query, $GLOBALS['DB']);
    }
}
mysql_free_result($result);

curl_close($ch);

echo 'Indexing complete.' . "\n";
?>
(C) Æliens 20/2/2008
You may not copy or print any of this material without explicit permission of the author or the publisher. In case of other copyright issues, contact the author.