import Tkinter as tk import sys class ExampleApp(tk.Tk): def __init__(self): tk.Tk.__init__(self) toolbar = tk.Frame(self) toolbar.pack(side="top", fill="x") b1 = tk.Button(self, text="CLEANER", command=self.cleaner) b2 = tk.Button(self, text="filter tweets", command=self.filter_tweets) b3 = tk.Button(self, text="names", command=self.names) b4 = tk.Button(self, text="dates", command=self.dates) b5 = tk.Button(self, text="links", command=self.links) b6 = tk.Button(self, text="news media", command=self.newsmedia) b1.pack(in_=toolbar, side="top", fill="both") b2.pack(in_=toolbar, side="left") b3.pack(in_=toolbar, side="left") b4.pack(in_=toolbar, side="left") b5.pack(in_=toolbar, side="left") b6.pack(in_=toolbar, side="left") self.text = tk.Text(self, wrap="word") self.text.pack(side="top", fill="both", expand=True) self.text.tag_configure("stderr", foreground="#b22222") sys.stdout = TextRedirector(self.text, "stdout") sys.stderr = TextRedirector(self.text, "stderr") def cleaner(self): #remove links one = open(".../scraped_tweets.txt", "r") two = open(".../1.txt", "wb") for line in one: #for part in line.split(): #if "http://" in part: #line = line.replace(part, "") #elif "https://" in part: #line = line.replace(part, "") two.write("") two.write(line) one.close() two.close() #remove useless lines import os one = open(".../1.txt", "r") two = open(".../2.txt", "wb") for line in one: if "vind-ik-leuk" in line: continue elif "Beantwoorden" in line: continue elif "Als antwoord op" in line: continue elif "Leuk" in line: continue elif "Meer" in line: continue elif " Als" in line: continue elif "heeft geretweet" in line: continue elif "heeft toegevoegd" in line: continue elif "is niet beschikbaar" in line: continue else: two.write(line) one.close() two.close() os.remove(".../1.txt") #isolate name+date1 one = open(".../2.txt", "r") two = open(".../3.txt", "wb") for line in one: if ". 2015" in line: two.write("") two.write(line) else: two.write(line) one.close() two.close() os.remove(".../2.txt") #fuse lines one = open(".../3.txt", "r") two = open(".../4.txt", "wb") lines = one.readlines() new = "\t".join([line.strip() for line in lines]) two.write(new) one.close() two.close() os.remove(".../3.txt") #isolate name+date2 one = open(".../4.txt", "r") two = open(".../5.txt", "wb") for line in one: line = line.replace("", "") two.write(line) one.close() two.close() os.remove(".../4.txt") #isolate name+date3 one = open(".../5.txt", "r") two = open(".../6.txt", "wb") for line in one: line = line.replace("", "\n") two.write(line) one.close() two.close() os.remove(".../5.txt") #isolate name+date4 one = open(".../6.txt", "r") two = open(".../7.txt", "wb") for line in one: if "" in line: two.write(line) one.close() two.close() os.remove(".../6.txt") #write name+date and tweet in seperate lines1 one = open(".../7.txt", "r") two = open(".../8.txt", "wb") for line in one: line = line.replace(". 2015", ". 2015\n") two.write(line) one.close() two.close() os.remove(".../7.txt") #remove tabs one = open(".../8.txt", "r") two = open(".../9.txt", "wb") for line in one: line = line.replace("\t", "") two.write(line) one.close() two.close() os.remove(".../8.txt") #remove one = open(".../9.txt", "r") two = open(".../10.txt", "wb") for line in one: line = line.replace("", "") two.write(line) one.close() two.close() os.remove(".../9.txt") #write name+date and tweet in seperate lines2 one = open(".../10.txt", "r") two = open(".../11.txt", "wb") for line in one: if "" in line: two.write(line) else: two.write("") two.write(line) one.close() two.close() os.remove(".../10.txt") #fuse lines one = open(".../11.txt", "r") two = open(".../12.txt", "wb") lines = one.readlines() new = "\t".join([line.strip() for line in lines]) two.write(new) one.close() two.close() os.remove(".../11.txt") #isolate name+date5 one = open(".../12.txt", "r") two = open(".../13.txt", "wb") for line in one: line = line.replace("", "\n") two.write(line) one.close() two.close() os.remove(".../12.txt") #isolate name+date6 one = open(".../13.txt", "r") two = open(".../14.txt", "wb") for line in one: if "" in line: two.write(line) one.close() two.close() os.remove(".../13.txt") #isolate name+date7 one = open(".../14.txt", "r") two = open(".../15.txt", "wb") for line in one: two.write(line) two.write(" ") one.close() two.close() os.remove(".../14.txt") #demarcate name/date/tweet1 one = open(".../15.txt", "r") two = open(".../16.txt", "wb") for line in one: two.write("") two.write(line[line.find(" "):line.find("")-13]) two.write(" ") two.write(line[line.find("")-13:line.find("")]) two.write(" ") two.write(line[line.find("")+7:line.find("")+5]) two.write("\n") one.close() two.close() os.remove(".../15.txt") #demarcate name/date/tweet2 one = open(".../16.txt", "r") two = open(".../demarcated.txt", "wb") for line in one: line = line.replace("", "") two.write(line.lower()) one.close() two.close() os.remove(".../16.txt") print "-------------------------" print "\ncleaned tweets in demarcated.txt" print "\n-------------------------" def filter_tweets(self): #remove tweets from certain accounts and remove tweets shorter than 3 words one = open(".../demarcated.txt", "r") two = open(".../tweets_deleted.txt", "wb") three = open(".../tweets_final.txt", "wb") for line in one: name = line[line.find(""):line.find("")] tweet = line[line.find("")+7:line.find("")+5] if len(tweet.split())<3: two.write(line) #elif "verifieerd" in name: # two.write(line) #elif "news" in name: # two.write(line) #elif "magazine" in name: # two.write(line) #elif "@aiwwenglish" in name: # two.write(line) #elif "@aiww_en" in name: # two.write(line) #elif "@visitchina" in name: # two.write(line) #elif "meetchina" in name: # two.write(line) #elif "aitigre" in name: # two.write(line) #elif "freeaiwwblog" in name: # two.write(line) #elif "awwneversorry" in name: # two.write(line) #elif "freeaiww" in name: # two.write(line) #elif "@shower" in name: # two.write(line) #elif "@pangrui" in name: # two.write(line) #elif "@qidelong" in name: # two.write(line) #elif "@adolph" in name: # two.write(line) #elif "museum" in name: # two.write(line) #elif "m_laurette" in name: # two.write(line) #elif "gogopromo" in name: # two.write(line) #elif "???" in name: # two.write(line) #elif "russia" in name: # two.write(line) #elif "pussy_riot" in name: # two.write(line) #elif "pussyriot" in name: # two.write(line) #elif "jafarpanahi" in name: # two.write(line) #elif "jafar" in name: # two.write(line) else: three.write(line[line.find(""):line.find("")+5]) three.write("\n") one.close() two.close() three.close() print "\ndeleted tweets in tweets_deleted.txt" print "usable tweets in tweets_final.txt" print "\n-------------------------" def names(self): #isolate name1 import os one = open(".../tweets_final.txt", "r") two = open(".../names1.txt", "wb") for line in one: two.write(line[line.find("")+7:line.find(". 2015")-13]) two.write("\n") one.close() two.close() #isolate name2 one = open(".../names1.txt", "r") two = open(".../names2.txt", "wb") for line in one: list = line.strip().split() print>>two, list[-1:] one.close() two.close() os.remove(".../names1.txt") #find username one = open(".../names2.txt", "r") two = open(".../names.txt", "wb") for line in one: two.write(line[line.find("@"):line.find("]")-1]) two.write("\n") one.close() two.close() os.remove(".../names2.txt") from collections import Counter one = open(".../names.txt", "r") a = one.read() c = Counter(a.split()) one.close() print "\nnames in names.txt" print "\n25 most common names: \n", c.most_common(25) print "\n-------------------------" def dates(self): import os one = open(".../tweets_final.txt", "r") two = open(".../dates.txt", "wb") for line in one: two.write(line[line.find(". 2015")-6:line.find("")]) two.write("\n") one.close() two.close() one = open(".../dates.txt", "r") count1=0 count2=0 count3=0 count4=0 count5=0 count6=0 count7=0 count8=0 count9=0 count10=0 count11=0 count12=0 for line in one: if "dec." in line: count12+=1 elif "nov." in line: count11+=1 elif "okt." in line: count10+=1 elif "sep." in line: count9+=1 elif "aug." in line: count8+=1 elif "jul." in line: count7+=1 elif "jun." in line: count6+=1 elif "mei." in line: count5+=1 elif "apr." in line: count4+=1 elif "mrt." in line: count3+=1 elif "feb." in line: count2+=1 elif "jan." in line: count1+=1 one.close() two.close() os.remove(".../dates.txt") print "\njan. 2015: ", count1 print "feb. 2015: ", count2 print "mrt. 2015: ", count3 print "apr. 2015: ", count4 print "mei. 2015: ", count5 print "jun. 2015: ", count6 print "jul. 2015: ", count7 print "aug. 2015: ", count8 print "sep. 2015: ", count9 print "okt. 2015: ", count10 print "nov. 2015: ", count11 print "dec. 2015: ", count12 A = int(count1) + int(count2) + int(count3) + int(count4) + int(count5) + int(count6) + int(count7) + int(count8) + int(count9) + int(count10) + int(count11) + int(count12) print "\ntotal: ", A print "\n-------------------------" def links(self): #isolate links one = open(".../2015.txt", "r") two = open(".../links_all.txt", "wb") for line in one: for part in line.split(): if "http://" in part: two.write(part) two.write("\n") elif "https://" in part: two.write(part) two.write("\n") one.close() two.close() #isolate link core1 one = open(".../links_all.txt", "r") two = open(".../links1.txt", "wb") for line in one: line1 = line.replace("http://","").replace("https://","") two.write(line1) one.close() two.close() #isolate link core2 import os one = open(".../links1.txt", "r") two = open(".../links_core.txt", "wb") for line in one: line1 = line[line.find("")+7:line.find("/")] two.write(line1) two.write("\n") one.close() two.close() os.remove(".../links1.txt") #isolate unique links one = open(".../links_all.txt", "r") two = open(".../links_unique.txt", "wb") links = set() for line in one: if line not in links: two.write(line) links.add(line) else: continue one.close() two.close() from collections import Counter one = open(".../links_core.txt", "r") b = one.read() v = Counter(b.split()) print "\nall links in links_all.txt" print "all unique links in links_unique.txt" print "link frequencies in links_frequencies.txt" print "\n25 most common websites: \n", v.most_common(25) print "\n-------------------------" one.close() def newsmedia(self): one = open(".../demarcated.txt", "r") two = open(".../verified.txt", "wb") for line in one: if "geverifieerd" in line: two.write(line) one.close() two.close() import os one = open(".../verified.txt", "r") two = open(".../newsmedia.txt", "wb") for line in one: name = line[line.find(""):line.find("")] if "news" in name: two.write(line) elif "magazine" in name: two.write(line) elif "nyt" in name: two.write(line) elif "huff" in name: two.write(line) elif "guardian" in name: two.write(line) elif "television" in name: two.write(line) elif "observer" in name: two.write(line) elif "times" in name: two.write(line) elif "post" in name: two.write(line) one.close() two.close() os.remove(".../verified.txt") one = open(".../newsmedia.txt", "r") two = open(".../newslinks.txt", "wb") for line in one: for part in line.split(): if "http" in part: two.write(part) two.write("\n") one.close() two.close() print "-------------------------" print "\n news media in newsmedia.txt" print "\n news media links in newslinks.txt" print "\n-------------------------" class TextRedirector(object): def __init__(self, widget, tag="stdout"): self.widget = widget self.tag = tag def write(self, str): self.widget.configure(state="normal") self.widget.insert("end", str, (self.tag,)) self.widget.configure(state="disabled") app = ExampleApp() app.geometry("630x750") app.title("twitter data processor") app.mainloop()