So we want to give a search engine to our users. This sounds really simple, we could try:
$search_term_esc = AddSlashes($search_term); $sql = "SELECT * FROM Content WHERE content_body LIKE '%$search_term_esc%'";
- $search_term_esc = AddSlashes($search_term);
- $sql = "SELECT * FROM Content WHERE content_body LIKE '%$search_term_esc%'";
$sanitized_search=(isset($_GET['search'])&&($sanitized_search==""))?trim(strip_tags(addslashes($_GET['search']))):""; # 2 functions to split up search terms and accepting double-quotes as phrase makers. function search_transform_term($term){ $term = preg_replace("/(\s)/e", "'{WHITESPACE-'.ord('\$1').'}'", $term); $term = preg_replace("/,/", "{COMMA}", $term); return $term; } function search_split_terms($terms){ $terms = preg_replace("/\"(.*?)\"/e", "search_transform_term('\$1')", $terms); $terms = preg_split("/\s+|,/", $terms); $out = array(); foreach($terms as $term){ $term = preg_replace("/\{WHITESPACE-([0-9]+)\}/e", "chr(\$1)", $term); $term = preg_replace("/\{COMMA\}/", ",", $term); $out[] = $term; } return $out; } # prepare for MySQL regular expressions function search_escape_rlike($string){ return preg_replace("/([.\[\]*^\$])/", '\\\$1', $string); } # function to list terms into list of regular expressions # eg. search for 'foo' matches 'a foo a' but not 'a food a'. function search_db_escape_terms($terms){ $out = array(); foreach($terms as $term){ $out[] = '[[::]]'; } return $out; } # function to organise ranking function search_rx_escape_terms($terms){ $out = array(); foreach($terms as $term){ $out[] = '\b'.preg_quote($term, '/').'\b'; } return $out; } # function to sort results by rank. # NOTE: does not seem to be working properly function search_sort_results($a, $b){ $ax = $a["Score"]; $bx = $b["Score"]; if ($ax == $bx){ return 0; } return ($ax > $bx) ? -1 : 1; } # get the words we're going to display function search_return_keywords($str) { $search = array ( '@]*?>.*?@si', // Strip out javascript '@]*?>.*?@si', // Strip out Inline Stylesheets '@[{].*?[}]@si', // Strip out Internal Joomla Curly Brackets commands? '@@si', // Strip out HTML tags '@([\r\n])[\s]+@', // Strip out white space '@(\d+);@e' ); $str = preg_replace($search, ' ', $str); $str = strip_tags(str_replace(array('[',']'), array(''), $str)); // remove BB Code and HTML tags $str = html_entity_decode( $str, ENT_QUOTES, "utf-8" ); // converts html entities into characters (symbols) $str = preg_replace("/&.{0,}?;/",'', $str); // removes numeric entities? $str = preg_replace('//i', '', $str); // removes MS Office style html comments (and everything in between) $str = str_replace(array("\r\n", "\r", "\n", "\t"), ' ', $str); // remove carriage returns, line feeds, tabs and double-spaces # for mediawiki $str = str_replace('=', ' ', $str); // replace any equal signs with a single-space $str = str_replace('"', ' ', $str); // replace any double-quote signs with a single-space $str = str_replace("'", ' ', $str); // replace any apostrophe signs with a single-space $str = str_replace("#", ' ', $str); // replace any hash signs with a single-space $str = str_replace("_", ' ', $str); // replace any hash signs with a single-space $str = str_replace("*", ' ', $str); // replace any asterisk signs with a single-space $str = str_replace("(", ' ', $str); // replace any open parenthesis with a single-space $str = str_replace(")", ' ', $str); // replace any closing parenthesis with a single-space $str = str_replace("\\", ' ', $str); // replace any closing parenthesis with a single-space # to clean up $str = str_replace(' ', ' ', $str); // replace any double-spaces with a single-space $str = str_replace(' ', ' ', $str); // replace any double-spaces with a single-space return $str; } # highlight terms within search results function search_highlight($text, $terms_rx){ $start = '(^|)'; $end = '($|)'; return preg_replace( "/$start(.*?)$end/se", "StripSlashes('\\1').". "search_highlight_inner(StripSlashes('\\2'), \$terms_rx).". "StripSlashes('\\3')", $text ); } # highlight terms within search results function search_highlight_inner($text, $terms_rx){ $colors = search_get_highlight_colors(); foreach($terms_rx as $term_rx){ $color = array_shift($colors); $text = preg_replace( "/($term_rx)/ise", "search_highlight_do(StripSlashes('\\1'), \$color)", $text ); } return $text; } # determine colors to use function search_get_highlight_colors(){ return array( array('#ffffcc','#CA6C17'), array('#EEE823','#000000'), array('#2EBDB6','#000000'), array('#7570B3','#000000'), array('#FFD400','#000000'), array('#ED017C','#ffffff'), array('#7A003F','#ffffff'), array('#7C6A55','#ffffff'), array('#0084B6','#ffffff'), array('#260357','#ffffff'), ); } function search_highlight_do($fragment, $color){ return "". "$fragment"; } function search_pretty_terms_highlighted($terms_html){ $colors = search_get_highlight_colors(); $temp = array(); foreach($terms_html as $term_html){ $color = array_shift($colors); $temp[] = search_highlight_do($term_html, $color); } return search_pretty_terms($temp); } # separate function for dealing with apostrophes (injection?) function search_final_clean($terms) { $out = array(); foreach($terms as $term){ $out[] = addslashes($term); } return $out; } $terms = search_split_terms($terms); $out=array(); foreach($terms as $term) { # remove terms that are not long enough (so no searches for the letter "e") # remove terms that are in the omit word database if ((!in_array($term, $omit_search_words))&&(strlen($term)>=$MinimimumStringLength)) { $omitted_words_array[]=$term; $out[]=$term; } } $terms=array(); $terms=$out; # start creating the sql query $terms_db = search_db_escape_terms($terms); $terms_db_sql = search_final_clean($terms); $parts = array(); foreach($terms_db as $term_db){ $parts[] = "PageTitle RLIKE '$term_db'"; } $parts1 = implode(' '.strtoupper($select_joiner).' ', $parts); $parts = array(); foreach($terms_db as $term_db){ $parts[] = "PageContent RLIKE '$term_db'"; } $parts2 = implode(' '.strtoupper($select_joiner).' ', $parts); $parts = $parts1 . " OR " . $parts2; $scoreparts = array(); foreach($terms_db_sql as $term){ $scoreparts[] = "(((SELECT LENGTH(t.PageTitle)-length(replace(LOWER(t.PageTitle),LOWER('$term'),'')))/LENGTH('$term'))*10)"; } foreach($terms_db_sql as $term){ $scoreparts[] = "((SELECT LENGTH(t.PageContent)-length(replace(LOWER(t.PageContent),LOWER('$term'),'')))/LENGTH('$term'))"; } if (count($terms)==0) { $terms=array(); $terms[]=strtolower($sanitized_search); $terms_db = search_db_escape_terms($terms); $terms_db_sql = bukb_apostrophes($terms); $term_db=$terms_db_sql[0]; $scorepartstring = "(((SELECT LENGTH(t.PageTitle)-length(replace(LOWER(t.PageTitle),LOWER('$term'),'')))/LENGTH('$term'))*10)"; $parts="(PageTitle RLIKE '[[::]]')".$parts."(PageContent RLIKE '[[::]]')"; } else { $scorepartstring = implode(' + ', $scoreparts); } // Generate the SQL query for your search engine $filter_sql = " SELECT t.PageID, t.PageTitle, t.PageContent, t.Source, ".$scorepartstring." AS Score FROM (".$live_sub_query.") AS t WHERE ".$parts." ORDER BY Score DESC LIMIT 0, ".$maximum_number_of_results_from_db; $terms_rx = search_rx_escape_terms($terms); $results_rows = array(); $result = mysql_query($filter_sql); if ($result) { while($row=mysql_fetch_assoc($result)) { $row["Score"] = 0; foreach($terms_rx as $term_rx){ $row["Score"] += preg_match_all("/$term_rx/i", $row["PageTitle"], $null)*10; $row["Score"] += preg_match_all("/$term_rx/i", $row["PageContent"], $null); } $results_rows[] = $row; } } uasort($results_rows, 'search_sort_results'); $terms_rx = search_rx_escape_terms($terms); $display_excerpt=search_highlight($result_Content, $terms_rx)."...";
- $sanitized_search=(isset($_GET['search'])&&($sanitized_search==""))?trim(strip_tags(addslashes($_GET['search']))):"";
- # 2 functions to split up search terms and accepting double-quotes as phrase makers.
- function search_transform_term($term){
- $term = preg_replace("/(\s)/e", "'{WHITESPACE-'.ord('\$1').'}'", $term);
- $term = preg_replace("/,/", "{COMMA}", $term);
- return $term;
- }
- function search_split_terms($terms){
- $terms = preg_replace("/\"(.*?)\"/e", "search_transform_term('\$1')", $terms);
- $terms = preg_split("/\s+|,/", $terms);
- $out = array();
- foreach($terms as $term){
- $term = preg_replace("/\{WHITESPACE-([0-9]+)\}/e", "chr(\$1)", $term);
- $term = preg_replace("/\{COMMA\}/", ",", $term);
- $out[] = $term;
- }
- return $out;
- }
- # prepare for MySQL regular expressions
- function search_escape_rlike($string){
- return preg_replace("/([.\[\]*^\$])/", '\\\$1', $string);
- }
- # function to list terms into list of regular expressions
- # eg. search for 'foo' matches 'a foo a' but not 'a food a'.
- function search_db_escape_terms($terms){
- $out = array();
- foreach($terms as $term){
- $out[] = '[[::]]';
- }
- return $out;
- }
- # function to organise ranking
- function search_rx_escape_terms($terms){
- $out = array();
- foreach($terms as $term){
- $out[] = '\b'.preg_quote($term, '/').'\b';
- }
- return $out;
- }
- # function to sort results by rank.
- # NOTE: does not seem to be working properly
- function search_sort_results($a, $b){
- $ax = $a["Score"];
- $bx = $b["Score"];
- if ($ax == $bx){ return 0; }
- return ($ax > $bx) ? -1 : 1;
- }
- # get the words we're going to display
- function search_return_keywords($str) {
- $search = array (
- '@]*?>.*?@si',  // Strip out javascript
- '@]*?>.*?@si',  // Strip out Inline Stylesheets
- '@[{].*?[}]@si',  // Strip out Internal Joomla Curly Brackets commands?
- '@@si',  // Strip out HTML tags
- '@([\r\n])[\s]+@',  // Strip out white space
- '@&#(\d+);@e'
- );
- $str = preg_replace($search, ' ', $str);
- $str = strip_tags(str_replace(array('[',']'), array(''), $str));  // remove BB Code and HTML tags
- $str = html_entity_decode( $str, ENT_QUOTES, "utf-8" );  // converts html entities into characters (symbols)
- $str = preg_replace("/&.{0,}?;/",'', $str);  // removes numeric entities?
- $str = preg_replace('//i', '', $str);  // removes MS Office style html comments (and everything in between)
- $str = str_replace(array("\r\n", "\r", "\n", "\t"), ' ', $str);  // remove carriage returns, line feeds, tabs and double-spaces
- # for mediawiki
- $str = str_replace('=', ' ', $str);  // replace any equal signs with a single-space
- $str = str_replace('"', ' ', $str);  // replace any double-quote signs with a single-space
- $str = str_replace("'", ' ', $str);  // replace any apostrophe signs with a single-space
- $str = str_replace("#", ' ', $str);  // replace any hash signs with a single-space
- $str = str_replace("_", ' ', $str);  // replace any hash signs with a single-space
- $str = str_replace("*", ' ', $str);  // replace any asterisk signs with a single-space
- $str = str_replace("(", ' ', $str);  // replace any open parenthesis with a single-space
- $str = str_replace(")", ' ', $str);  // replace any closing parenthesis with a single-space
- $str = str_replace("\\", ' ', $str);  // replace any closing parenthesis with a single-space
- # to clean up
- $str = str_replace(' ', ' ', $str);  // replace any double-spaces with a single-space
- $str = str_replace(' ', ' ', $str);  // replace any double-spaces with a single-space
- return $str;
- }
- # highlight terms within search results
- function search_highlight($text, $terms_rx){
- $start = '(^|)';
- $end = '($|)';
- return preg_replace(
- "/$start(.*?)$end/se",
- "StripSlashes('\\1').".
- "search_highlight_inner(StripSlashes('\\2'), \$terms_rx).".
- "StripSlashes('\\3')",
- $text );
- }
- # highlight terms within search results
- function search_highlight_inner($text, $terms_rx){
- $colors = search_get_highlight_colors();
- foreach($terms_rx as $term_rx){
- $color = array_shift($colors);
- $text = preg_replace(
- "/($term_rx)/ise",
- "search_highlight_do(StripSlashes('\\1'), \$color)",
- $text );
- }
- return $text;
- }
- # determine colors to use
- function search_get_highlight_colors(){
- return array(
- array('#ffffcc','#CA6C17'),
- array('#EEE823','#000000'),
- array('#2EBDB6','#000000'),
- array('#7570B3','#000000'),
- array('#FFD400','#000000'),
- array('#ED017C','#ffffff'),
- array('#7A003F','#ffffff'),
- array('#7C6A55','#ffffff'),
- array('#0084B6','#ffffff'),
- array('#260357','#ffffff'),
- );
- }
- function search_highlight_do($fragment, $color){
- return "".
- "$fragment";
- }
- function search_pretty_terms_highlighted($terms_html){
- $colors = search_get_highlight_colors();
- $temp = array();
- foreach($terms_html as $term_html){
- $color = array_shift($colors);
- $temp[] = search_highlight_do($term_html, $color);
- }
- return search_pretty_terms($temp);
- }
- # separate function for dealing with apostrophes (injection?)
- function search_final_clean($terms) {
- $out = array();
- foreach($terms as $term){
- $out[] = addslashes($term);
- }
- return $out;
- }
- $terms = search_split_terms($terms);
- $out=array();
- foreach($terms as $term) {
- # remove terms that are not long enough (so no searches for the letter "e")
- # remove terms that are in the omit word database
- if ((!in_array($term, $omit_search_words))&&(strlen($term)>=$MinimimumStringLength)) {
- $omitted_words_array[]=$term;
- $out[]=$term;
- }
- }
- $terms=array();
- $terms=$out;
- # start creating the sql query
- $terms_db = search_db_escape_terms($terms);
- $terms_db_sql = search_final_clean($terms);
- $parts = array();
- foreach($terms_db as $term_db){
- $parts[] = "PageTitle RLIKE '$term_db'";
- }
- $parts1 = implode(' '.strtoupper($select_joiner).' ', $parts);
- $parts = array();
- foreach($terms_db as $term_db){
- $parts[] = "PageContent RLIKE '$term_db'";
- }
- $parts2 = implode(' '.strtoupper($select_joiner).' ', $parts);
- $parts = $parts1 . " OR " . $parts2;
- $scoreparts = array();
- foreach($terms_db_sql as $term){
- $scoreparts[] = "(((SELECT LENGTH(t.PageTitle)-length(replace(LOWER(t.PageTitle),LOWER('$term'),'')))/LENGTH('$term'))*10)";
- }
- foreach($terms_db_sql as $term){
- $scoreparts[] = "((SELECT LENGTH(t.PageContent)-length(replace(LOWER(t.PageContent),LOWER('$term'),'')))/LENGTH('$term'))";
- }
- if (count($terms)==0) {
- $terms=array();
- $terms[]=strtolower($sanitized_search);
- $terms_db = search_db_escape_terms($terms);
- $terms_db_sql = bukb_apostrophes($terms);
- $term_db=$terms_db_sql[0];
- $scorepartstring = "(((SELECT LENGTH(t.PageTitle)-length(replace(LOWER(t.PageTitle),LOWER('$term'),'')))/LENGTH('$term'))*10)";
- $parts="(PageTitle RLIKE '[[::]]')".$parts."(PageContent RLIKE '[[::]]')";
- } else {
- $scorepartstring = implode(' + ', $scoreparts);
- }
- // Generate the SQL query for your search engine
- $filter_sql = "
- SELECT
- t.PageID,
- t.PageTitle,
- t.PageContent,
- t.Source,
- ".$scorepartstring."
- as Score
- FROM (".$live_sub_query.") as t
- WHERE ".$parts."
- ORDER BY Score DESC
- LIMIT 0, ".$maximum_number_of_results_from_db;
- $terms_rx = search_rx_escape_terms($terms);
- $results_rows = array();
- $result = mysql_query($filter_sql);
- if ($result) {
- while($row=mysql_fetch_assoc($result)) {
- $row["Score"] = 0;
- foreach($terms_rx as $term_rx){
- $row["Score"] += preg_match_all("/$term_rx/i", $row["PageTitle"], $null)*10;
- $row["Score"] += preg_match_all("/$term_rx/i", $row["PageContent"], $null);
- }
- $results_rows[] = $row;
- }
- }
- uasort($results_rows, 'search_sort_results');
- $terms_rx = search_rx_escape_terms($terms);
- $display_excerpt=search_highlight($result_Content, $terms_rx)."...";
$didyoumean_words_array=array(); foreach ($terms as $term) { $term=strtolower($term); # get closest word by LEVENSHTEIN $shortest = -1; foreach ($levenshtein_words as $word) { $lev = levenshtein($term, $word); if ($lev == 0) { $closest = $word; $shortest = 0; break 1; } if ($lev 0)?"Did you mean: ".$didyoumean_string_joined."?":""; $didyoumean_words_array=array(); // Note: for some systems you can do a levenshtein on two words soundex value and compare these. // Be warned that it does go off track though in terms of relevance...
- $didyoumean_words_array=array();
- foreach ($terms as $term) {
- $term=strtolower($term);
- # get closest word by LEVENSHTEIN
- $shortest = -1;
- foreach ($levenshtein_words as $word) {
- $lev = levenshtein($term, $word);
- if ($lev == 0) {
- $closest = $word;
- $shortest = 0;
- break 1;
- }
- if ($lev 0)?"Did you mean: ".$didyoumean_string_joined."?":"";
- $didyoumean_words_array=array();
- // Note: for some systems you can do a levenshtein on two words soundex value and compare these.
- // Be warned that it does go off track though in terms of relevance...