Inverted 2

<?php include('stemmer.class.inc'); $docID = 0; //actual article number in sequence $textDocID = 0; //article number from TIME.
ALL $index = 0; //dictionary term index $dictionary = array(); $stopwords = array(" i ", " a ", " about ", " an ", " and ", " are ", " as ", " at ", " be ", " but ", " by ", " com ", " de ", " en ", " for ", " from ", " has ", " how ", " in ", " is ", " it ", " la ", " of ", " on ", " or ", " that ", " the ", " this ", " to ", " was ", " what ", " when ", " where ", " who ", " wil l ", " with ", " und ", " the ", " www "); $time_start = microtime(true); /////// $filename = "./test_archive/TIME.ALL"; $fileString = file_get_contents($filename); $fileString = preg_replace("/\n\r|\n|\r/"," ",$fileString); //remove line breaks $word = strtok($fileString, " "); //start tokenizing while ($word) { /* Format of file: docID date pageNum ^ ^ ^ *TEXT ### ##/##/## PAGE ### ... article contents ... *TEXT ### ##/##/## PAGE ### ... more articles... *STOP */ if (strcmp("*TEXT", $word) == 0) { //if article is found, process it $articleString = ""; $docID++; //increase sequential article number $word = strtok(" "); docID $textDocID = $word; //echo "\nStart of article: ".$textDocID." (".$docID.")\n"; for ($i=0; $i<=3; $i++) $word = strtok(" "); //skip the date and page number //next word is article number or
//keep reading words until the next article begins or the file ends while (strcmp("*TEXT", $word) != 0 && strcmp("*STOP", $word) != 0) { $cleanedString_Stage1 = preg_replace("/[^a-zA-Z0-9\s]/", "", strtolo wer($word)); //remove symbols if ($cleanedString_Stage1 != "") { $stemmedString = PorterStemmer::Stem($cleanedString_Stage1); //s tem the word $cleanedString_Stage2 = ' '.$stemmedString.' '; $cleanedString_Stage2 = str_replace($stopwords,"",$cleanedString _Stage2); //remove stopwords $cleanedString_Stage2 = trim($cleanedString_Stage2);
} if ($cleanedString_Stage2 != "") $articleString = $articleString." ".$cleanedString_Stage2; //do some clean-up before the next article $cleanedString_Stage1 = ""; $cleanedString_Stage2 = ""; $stemmedString = ""; $word = strtok(" "); }//end while //articleString now contains one article //this line gets the term frequency per article $article_word_count = (array_count_values(str_word_count($articleString, 1,"0..9"))); //create a dictionary containing: // 1. the term // 2. the article it appears in // 3. how many times it appears in that article foreach ($article_word_count as $key=>$val) { $dictionary[$index] = array( "term" => $key, "docID" => $textDocID, "freq" => $val ); $column[] = $dictionary[$index]["Term"]; //needed to sort the dictio nary later on $index++; } } else $word = strtok(" "); //if article is not found, keep tokenizing if (strcmp("*STOP", $word) == 0) //break out of loop at eof break; }//end while array_multisort($column, SORT_ASC, $dictionary); //sort dictionary alphabeticall y //now that we have an alphabetically sorted dictionary //we can create the lexicon and postings files $lexicon = array(); $postings = array(); $pointer = 0; //pointer to postings entry $myPostings = "postings.txt"; $fhPost = fopen($myPostings,'a'); foreach ($dictionary as $key=>$value) { $term = $dictionary[$key]["term"]; $docID = $dictionary[$key]["docID"]; $freq = $dictionary[$key]["freq"]; if (isset($lexicon[$term])) {
$lexicon[$term]["numDocs"] += 1; $lexicon[$term]["totalFreq"] += $freq; } else { $lexicon[$term]["numDocs"] = 1; $lexicon[$term]["totalFreq"] = $freq; $lexicon[$term]["pointer"] = $pointer; } $postingsData = $docID." ".$freq."\n"; fwrite($fhPost, $postingsData); $pointer++; } fclose($fhPost); //write lexicon to hd $myLexicon = "lexicon.txt"; $fhLex = fopen($myLexicon,'a'); foreach ($lexicon as $key=>$value) { $data = $key." ".$lexicon[$key]["numDocs"]." ".$lexicon[$key]["totalFreq"]." ".$lexicon[$key]["pointer"]."\n"; fwrite($fhLex, $data); } fclose($fhLex); ////// $time_end = microtime(true); $time = $time_end - $time_start; echo "\n\nExecution time: ".$time."\n"; ?>

Inverted 2

Uploaded by

Document Information

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Inverted 2

Uploaded by

Copyright:

Available Formats

<?php include('stemmer.class.inc'); $docID = 0; //actual article number in sequence $textDocID = 0; //article number from TIME.

You might also like