// Web Scraping
import java.io.IOException;
import java.util.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
class Main {
public static void main(String[] args) throws IOException {
String[] stopWords = {
"a", "
as", "able", "about", "above", "according", "accordingly",
"across", " actually", "after", "afterwards", "again", "against", "aint",
"all", "allow", "allows", "almost", "alone", "along", "
already", "also",
"although", "always", "am", "among", "amongst", "an", " and", "another",
"any", "anybody", "anyhow", "anyone", "anything", "anyway", "
anyways",
"anywhere", "apart", "appear", "appreciate", "appropriate", " are",
"arent", "around", "aside", "ask", "asking", "associated", "
at",
"available", "away", "awfully", "be", "became", "because", " become",
becoming", "been", "before", "
"becomes", " beforehand", "behind", "being",
"believe", " below", "beside", "besides", " best", "better", "between",
"beyond", "both", "
brief", "but", "by", "cmon", "cs", "came", "can",
"cant", "cannot", " cant", "cause", "causes", "certain", "certainly",
"changes", "clearly", "co", "com", "come", "comes", "concerning",
"consequently", "consider", "considering", "contain", "containing",
"contains", "corresponding", "could", "couldnt", "course", "currently",
"definitely", "described", " despite", "did", "didnt", "different", "do",
"does", "doesnt", "doing", " dont", "done", "down", "downwards", "during",
"each", "edu", "eg", "eight", "either", "else", "elsewhere", "enough",
"entirely", "especially", "et", "etc", "even", "ever", "every",
"everybody", "everyone", "everything", "everywhere", "ex", " exactly",
"example", "except", "far", "few", "ff", "fifth", "first", " five",
"followed", "following", "follows", "for", "former", "formerly", "forth",
"four", "from", "further", "furthermore", "get", "gets", "getting",
"given", "gives", "go", "goes", "going", "gone", "got", "gotten",
"greetings", "had", "hadnt", "happens", "hardly", "has", "hasnt", "have",
"havent", "having", "he", "hes", "hello", "
help", "hence", "her", "here",
"heres", "hereafter", "hereby", "herein", " hereupon", "hers", "herself",
"hi", "him", "himself", "his", "hither", "hopefully", " how", "howbeit",
"however", "i", "id", "ill", "im", "ive", "ie", "if", " ignored",
in", "inasmuch", "inc", "indeed", "indicate", "indicated",
"immediate", "
"indicates", " inner", "insofar", "instead", "into", "inward", "is",
"isnt", "it", "itd", "itll", "its", "its", "
itself", "just", " keep",
"keeps", "kept", "know", "knows", "known", " last", "lately", " later",
"latter", "latterly", "least", "less", "lest", "let", "lets", "like",
"liked", "likely", "little", "look", "looking", "looks", "ltd", "mainly",
may", "maybe", "me", "mean", "
"many", " meanwhile", "merely", "might",
"more", " moreover", "most", "mostly", " much", "must", "my", "myself",
"name", "namely", "nd", "near", "nearly", "necessary", "need", "needs",
"neither", "never", "nevertheless", "new", "next", "nine", "no", "nobody",
none", "noone", "nor", "
"non", " normally", "not" , "nothing", "novel",
"now", " nowhere", "obviously", " of", "off", "often", "oh", "ok", "okay",
"old", "on", "once", "one", "ones", "only", "onto", "or", "other",
"others", "otherwise", "ought", "our", "ours", "ourselves", "out",
over", "overall", "own", "particular", "
"outside", " particularly", "per",
"perhaps", " placed", "please", "plus", "possible", " presumably",
"probably", "provides", "que", "quite", "qv", "rather", "rd", "re",
"really", "reasonably", "regarding", "regardless", "regards",
"relatively", "respectively", "right", "said", "same", "saw", "say",
"saying", "says", "second", "
secondly", "see", "seeing", "seem", "seemed",
"seeming", "seems", "seen", " self", "selves", "sensible", "sent",
"serious", "seriously", "seven", "several", "shall", "she", "should",
"shouldnt", "since", "six", "so", "some", "somebody", "somehow",
"someone", "something", "sometime", "sometimes", "somewhat", "somewhere",
"soon", "sorry", "specified", "specify", "specifying", "still", "sub",
sup", "sure", "ts", "take", "taken", "tell", "tends", "th",
"such", "
"than", " thank", "thanks", "thanx", "that", "thats", "thats", "the",
"their", "theirs", "them", "themselves", "then", "thence", "
there",
"theres", "thereafter", "thereby", "therefore", "therein", " theres",
"thereupon", "these", "they", "theyd", "theyll", "theyre", "theyve",
"think", "third", "this", "thorough", "thoroughly", "those", "though",
"three", "through", "throughout", "thru", "thus", "to", "
together", "too",
"took", "toward", "towards", "tried", "tries", "truly", " try", "trying",
two", "un", "under", "
"twice", " unfortunately", "unless", "unlikely",
"until", " unto", "up", "upon", " us", "use", "used", "useful", "uses",
"using", "usually", "value", "various", "very", "via", "viz", "vs",
wants", "was", "wasnt", "way", "we", "wed", "
"want", " well", " were",
"weve", " welcome", "well", "went", "were", "werent", " what", " whats",
"whatever", "when", "whence", "whenever", "where", "wheres", "whereafter",
"whereas", "whereby", "wherein", "whereupon", "wherever", "whether",
while", "whither", "who", "whos", "whoever", "whole", "whom",
"which", "
"whose", " why", "will", "willing", "wish", "with", "within", "without",
wonder", "would", "would", "wouldnt", "yes", "yet", "you",
"wont", "
"youd", " youll", "youre", "youve", "your", "yours", "yourself",
"yourselves", "zero", "x", "s"};
// Non-word fragments: contraction/possessive suffixes in both ASCII and
// typographic apostrophe forms, bare quote characters, and assorted symbols.
// NOTE(review): the consumer of this array is not visible here.
String[] stopWordsExtra = {
    // contraction suffixes written with the ASCII apostrophe
    "'ll", "'s", "'m", "n't", "'re", "'ve", "'d",
    // the same suffixes written with the typographic apostrophe
    // NOTE(review): " ’d" carries a leading space, unlike its siblings --
    // kept as-is; confirm whether that is intentional.
    "’ll", "n’t", "’s", "’m", "’re", "’ve", " ’d",
    // plural possessives
    "s'", "s’",
    // bare quotation marks
    "'", "\"", "“", "”", "’",
    // symbols and bullets
    "©", "℗", "®", "™", "•", "·", "–", "◉"
};
String[] positiveWords = {
able", "
" accepting", "active", "addition", "admirable", "adorable",
"affirming", " ageless", "agreeable", "abundant", "accomplished",
"accurate", "adaptable", "agile", "alert", "ambitious", "appreciative",
"attentive", "aware", "authentic", "attactive", "affectionate", "amazing",
"awesome", "amusing", "beautiful", "beloved", "benficial", "benevolent",
"best", "better", "blessed", "blissful", "blooming", "blossoming", "bold",
"brilliant", "brave", "caring", "cute", "creative", "calm", "capable",
"certain", "challenging", "charming", "choice", "clean", "comfortable",
"charitable", "careful", "cool", "charitable", "cheerful", "clear",
"committed", "competent", "concentration", "compassionate", "confident",
"consistent", "convincing", "courageous", "courteous", "cooperative",
"curious", "considerate", "desirable", "decent", "delicate", "delicious",
"dreamy", "dynamic", "daring", "dude!", "delightful", "dependable",
"desirable", "devoted", "determined", "diligent", "disciplined",
"diverse", "drive", "dazzling", "divine", "excellent", "educated",
empathetic", "easy", "
"efficient", " enabling", "energetic", "engaging",
"enjoyable", " eager", "effective", " elated", "elegant", "encouraging",
"enthusiastic", "exciting", "experienced", "expert", "
explorer",
"expressive", "enlightened", "exalted", "empowered", " exhilirating",
"engrossing", "ecstatic", "entrancing", "enlivened", "fantastic",
"fabulous", "fair", "faithful", "famous", "favorite", "flexible",
"focused", "flourishing", "forgiving", "free", "fun", "frugal",
"friendly", "fascinating", "fulfilled", "foody", "feisty", "festive",
"good", "glowing", "generous", "genius", "genuine", "giving", "grace",
"gratitude", "growing", "grounded", "glorious", "groovy", "giddy", "glad",
"hopeful", "hot", "happy", "harmonious", "healthy", "helpful", "honest",
"humorous", "human", "hero", "holy", "honesty", "honorable", "hospitable",
"humble", "halo", "imaginative", "inspiring", "ideal", "incredible",
"interesting", "innovative", "improving", "imaginative", "independent",
"ingenius", "insightful", "inspiring", "integrity", "intelligent",
"involved", "inclusive", "intriguing", "intuitive", "joyful", "jokey",
"jolly", "jovial", "just", "jazzy", "jaunty", "jubilant", "junior",
"jumpy", "juvenile", "kind", "killer", "keen", "knowledgable", "kudos",
"kitschy", "kindred", "kool", "loving", "learner", "laugh", "leader",
lucky", "light", "loyal", "louable", "luxurious", "lively",
"logical", "
"likable", " magnificent", "meaningful", "majestic", "marvelous",
"motivating", "miraculous", "magic", "masterful", "mindful", "modest",
"merciful", "mellow", "nice", "noble", "neat", "new", "nurturing",
"noisy", "normal", "noteworthy", "novel", "nutty", "
outstanding",
"optimistic", "original", "obedient", "organized", " perfect", "positive",
"peaceful", "paradisiacal", "
passionate", "powerful", " prepared",
"perceptive", "persistent", " pleasing", "prosperous", " playful",
"present", "quality", "quiet", "quaint", "qualified", "quick",
"respectful", "radiant", "ready", "rockin", "relaxing", "remarkable",
"rational", "respectful", "responsible", "resourceful", "romantic",
"righteous", "resilient", "rad", "soulmate", "special", "selfless",
"secure", "safe", "sincere", "stylish", "sympathetic", "strong",
"sparkly", "sunshiney", "spontaneous", "sweet", "supportive", "true",
"teachable", "trusting", "thankful", "timely", "tranquil", "tender",
"thrilling", "ticklish", "unique", "uplifting", "ultimate",
"unconditional", "upgrade", "useful", "unifying", "understanding",
"valuable", "virtuous", "valid", "viable", "victorious", "vibrant",
"worthy", "wild", "wacky", "wonderful", "winner", "welcome", "witty",
"wholesome", "yahoo", "yodeler", "yolo", "zesty", "zealous", "zany",
"zippy", "zoomy", "zingy", "zamazing" };
String[] negativeWords = {
abysmal", "adverse", "alarming", "angry", "annoy", "anxious",
"
"apathy", "appalling", "atrocious", "awful","bad", "banal", "barbed",
"belligerent", "bemoan", "beneath", " boring", "broken", "callous",
"cant", "clumsy", "coarse", "cold", " coldhearted", "collapse", "confused",
"contradictory", "contrary", "corrosive", "corrupt", "crazy", " creepy",
"criminal", "cruel", "cry", "cutting", "damage", "damaging", " dastardly",
"dead", "decaying", "deformed", "deny", "deplorable", "depressed",
"deprived", "despicable", "detrimental", "dirty", "disease", "disgusting",
"disheveled", "dishonest", "dishonorable", "dismal", "distress", "dont",
"dreadful", "dreary", "enraged", "eroding", "evil", "fail", "faulty",
"fear", "feeble", "fight", "filthy", "foul", "frighten", "frightful",
"gawky", "ghastly", "grave", "greed", "grim", "
grimace", "gross",
"grotesque", "gruesome", "guilty", "haggard", " hard", "hardhearted",
hate", "
"harmful", " hideous", " homely", "horrendous", "horrible",
"hostile", " hurt", " hurtful", " icky", "ignorant", "ignore", "ill",
"immature", "imperfect", "impossible", "inane", "inelegant", "infernal",
"injure", "injurious", "insane", "insidious", "insipid", "jealous",
lose", "lousy", "lumpy", "malicious", "mean", "menacing",
"junky", "
"messy", " misshapen", "missing", "misunderstood", "moan", "moldy",
"monstrous", "naive", "nasty", "
naughty", "negate", "negative", "never",
"no", "nobody", "nondescript", " nonsense", "not", "noxious",
"objectionable", "odious", "offensive", "old", "oppressive", "pain",
"perturb", "pessimistic", "petty", "plain", "poisonous", "poor",
"prejudice","questionable", "quirky", "quit", "reject", "renege",
"repellant", "reptilian", "repugnant", "repulsive", "revenge",
"revolting", "rocky", "rotten", "rude", "ruthless", "sad", "savage",
"scare", "scary", "scream", "severe", "shocking", "shoddy", "sick",
"sickening", "sinister", "slimy", "smelly", "sobbing", "sorry",
"spiteful", "sticky", "stinky", "stormy", "stressful", "stuck", "stupid",
"substandard", "suspect", "suspicious", "tense", "terrible", "terrifying",
"threatening", "ugly", "
undermine", "unfair", "unfavorable", "unhappy",
"unhealthy", "unjust", " unlucky", "unpleasant", "unsatisfactory",
"unsightly", "untoward", "unwanted", "unwelcome", " unwholesome",
"unwieldy", "unwise", "upset", "vice", "vicious", " vile", "villainous",
"vindictive", "wary", "weary", "wicked", "woeful", "worthless",
"wound","yell", "yucky","zero"};
//Variables
// NOTE(review): no declarations follow this heading in the visible text, yet
// the statements in this method use scanner, URLs, page, doc, text1,
// component, componentHz and nextWord -- the declarations appear to be
// missing here; confirm against the complete file.
//implementation
// Closes the scanner once it is no longer read from.
scanner.close();
// Appends the "abs:href" attribute of page to URLs.
// NOTE(review): page is presumably a jsoup Element, in which case the "abs:"
// prefix requests the link resolved to an absolute URL -- confirm.
URLs.add(page.attr("abs:href"));
// NOTE(review): the three braces below close blocks whose opening lines are
// not visible here.
}
}
}
//start
// Keep the page text exactly as doc.text() returns it, before any cleaning.
unprocessedText = doc.text();
// Normalise the page title: lower-case it, strip punctuation, then delete
// every token that contains a digit together with the spaces following it.
// The digit pattern is a single-line literal, identical to the one used when
// cleaning text1; a string literal cannot be continued onto a second line.
String title = doc.title()
        .toLowerCase()
        .replaceAll("\\p{Punct}", "")
        .replaceAll("\\w*\\d\\w* *", "");
// Split on single spaces; consecutive spaces therefore yield empty entries.
String[] titleWord = title.split(" ");
//text clean
// Clean text1 in four passes, in this order:
// 1. drop leading/trailing whitespace
text1 = text1.trim();
// 2. remove every punctuation character
text1 = text1.replaceAll("\\p{Punct}", "");
// 3. remove tokens containing a digit, plus the spaces that follow them
text1 = text1.replaceAll("\\w*\\d\\w* *", "");
// 4. remove everything outside the 7-bit ASCII range
text1 = text1.replaceAll("[^\\x00-\\x7F]", "");
//Text count
// Tally nextWord. component and componentHz are used as parallel lists: the
// count for component.get(k) lives at componentHz.get(k).
if (component.contains(nextWord)) {
    // Already seen: bump the counter stored at the word's index.
    int i = component.indexOf(nextWord);
    componentHz.set(i, componentHz.get(i) + 1);
} else {
    // First occurrence: register the word with a count of one.
    component.add(nextWord);
    componentHz.add(1);
}
}
//analysis
// Sum the recorded frequency of every positive word that occurs in component.
int positiveCount = 0;
for (String candidate : positiveWords) {
    // indexOf returns -1 when the word was never recorded.
    int at = component.indexOf(candidate);
    if (at != -1) {
        positiveCount += componentHz.get(at);
    }
}
System.out.println(component.get(componentHz.indexOf(Collections.max(compo
nentHz))) + " " + Collections.max(componentHz) );
component.remove(componentHz.indexOf(Collections.max(componentHz)));
componentHz.remove(componentHz.indexOf(Collections.max(componentHz)));
}
}
}