"""Tkinter front end for cleaning and coding a newspaper-article export.

The window offers one button per processing stage.  Each stage reads a text
file from ``DATA_DIR``, writes the next file in the chain and reports progress
through ``print``, which is redirected into the window's text widget.

To consider before running (notes carried over from the original header):

For cleaner check:
- block 4: JOURNAL-CODE is not always given.
- block 6: change artist
"""

import os
import re
import sys
import tkinter as tk
from collections import Counter

# NOTE(review): many string literals in this file are empty ("") or a single
# space where a field marker would be expected, e.g. line.find(""),
# print("", count, "", ...) and replace("", ";").  They look as if markup-like
# delimiters were stripped from the file before review.  They are kept exactly
# as found; restore the real delimiters before relying on the output.
#
# NOTE(review): a few of those literals were split across two physical lines in
# the reviewed file.  They are written as "\n" below and marked individually;
# the line break is presumably what is left of a stripped delimiter - confirm.

# Directory holding every input, intermediate and output file.
# NOTE(review): the reviewed file shows only the placeholder "…" here; set this
# to the real data directory.
DATA_DIR = "…"

# Newspaper titles looked for in the article header, in match order, paired
# with the label written to the output.
_NEWSPAPERS = (
    ("Algemeen Dagblad", "algemeen dagblad"),
    ("Daily Mirror", "daily mirror"),
    ("Daily Star", "daily star"),
    ("Daily Telegraph", "daily telegraph"),
    ("Evening Standard", "evening standard"),
    ("NRC", "NRC"),
    ("Daily News", "daily news"),
    ("New York Times", "new york times"),
    ("Telegraaf", "telegraaf"),
    ("USA TODAY", "usa today"),
    ("Volkskrant", "volkskrant"),
    ("Washington Post", "washington post"),
)

# Dutch and English weekday names removed from the lower-cased headline part.
_WEEKDAYS = (
    "maandag", "monday",
    "dinsdag", "tuesday",
    "woensdag", "wednesday",
    "donderdag", "thursday",
    "vrijdag", "friday",
    "zaterdag", "saturday",
    "zondag", "sunday",
)

# Fragments deleted from the headline, in this order (longer "... final
# edition" variants are listed before the bare "final edition").
_HEADLINE_NOISE = (
    ";",
    "edition 1",
    "sports final edition",
    "sports final replate edition",
    "late edition - final",
    "final edition",
    "every edition",
    "regional edition",
    "suburban edition",
)

# Month names (Dutch and English) looked for after "LOAD-DATE: ", in match
# order, paired with the label written to the output.
# NOTE(review): "februari" is the only non-English output label; kept as found
# because downstream files may already contain it - confirm.
_MONTHS = (
    ("Januari", "january"),
    ("January", "january"),
    ("Februari", "februari"),
    ("February", "februari"),
    ("Maart", "march"),
    ("March", "march"),
    ("April", "april"),
    ("Mei", "may"),
    ("May", "may"),
    ("Juni", "june"),
    ("June", "june"),
    ("Juli", "july"),
    ("July", "july"),            # was "july", which never matched "July"
    ("Augustus", "august"),
    ("August", "august"),
    ("September", "september"),  # was misspelled "Sepember" and never matched
    ("Oktober", "october"),
    ("October", "october"),
    ("November", "november"),
    ("December", "december"),
)

# Section letters deleted from a page reference such as "A12".
_PAGE_LETTERS = str.maketrans("", "", "ACBDETQFGW")

# Punctuation turned into spaces in the article text.  The original replaced
# the two-character string backslash-hash (an invalid escape); "#" is used here.
_PUNCTUATION_TO_SPACE = str.maketrans({char: " " for char in ",.-!?:;_#&'\"()"})


def _path(name):
    """Return the location of file *name* inside ``DATA_DIR``."""
    return DATA_DIR + "/" + name


def _replace_each(text, targets, replacement=""):
    """Replace every string in *targets*, in order, with *replacement*."""
    for target in targets:
        text = text.replace(target, replacement)
    return text


def _month_from_load_date(text):
    """Return the label of the first month name from ``_MONTHS`` found in *text*.

    Returns "" when no month name occurs, so the caller never works with an
    unbound or stale value.
    """
    for needle, label in _MONTHS:
        if needle in text:
            return label
    return ""


def _headline_from(headline):
    """Strip edition and byline boilerplate from an extracted headline."""
    if "national edition" in headline:
        start = headline.find("national edition") + 16
        if "byline:" in headline:
            headline = headline[start:headline.find("byline:")]
        else:
            headline = headline[start:headline.find("SECTION:")]
    elif "byline:" in headline:
        headline = headline[:headline.find("byline:")]
    # NOTE(review): as written this replaces a space with a space (a no-op);
    # presumably it once collapsed a longer whitespace run - confirm.
    headline = headline.replace(" ", " ")
    return _replace_each(headline, _HEADLINE_NOISE)


def _record_fields(line):
    """Split one article line into the fields written by the cleaner.

    Returns the tuple ``(newspaper, headline, section, length, year, month,
    pubtype, text)``.

    Raises:
        IndexError: if fewer than three whitespace-separated tokens follow
            "LOAD-DATE: " (same as the original code).
    """
    newspaper = line[line.find(" ") + 12:line.find(" ")]
    headline = _headline_from(line[line.find(" ") + 8:line.find(" SECTION:")])

    section = line[line.find("SECTION: ") + 9:line.find(" LENGTH:")].lower()
    section = section.replace(";", "").replace("blz.", "pg.")
    section = section[:section.find("pg.")]

    length = line[line.find("LENGTH:") + 7:line.find(" woorden")]

    text = line[line.find("LENGTH: ") + 8:line.find("LOAD-DATE")]
    # Drop the first two tokens of the slice before normalising the rest.
    text = " ".join(text.split()[2:])
    text = text.translate(_PUNCTUATION_TO_SPACE).lower()

    load_date = line[line.find("LOAD-DATE: ") + 11:]
    year = load_date.split()[2]
    month = _month_from_load_date(load_date)

    pubtype = line[line.find("PUBLICATION-TYPE: "):line.find(" JOURNAL-CODE:")]
    pubtype = pubtype[pubtype.find("PUBLICATION-TYPE: ") + 18:pubtype.find(" Copyright")]
    pubtype = pubtype.lower()

    return newspaper, headline, section, length, year, month, pubtype, text


def _copy_unique(src_name, dst_name, key):
    """Copy lines from *src_name* to *dst_name*, keeping the first line per key.

    *key* maps a line to the value used to detect duplicates.
    """
    seen = set()
    with open(_path(src_name), "r") as src, open(_path(dst_name), "w") as dst:
        for line in src:
            marker = key(line)
            if marker in seen:
                continue
            seen.add(marker)
            dst.write(line)


def _read_vocabulary(name):
    """Return the lower-cased, newline-separated entries of file *name* as a set."""
    with open(_path(name), "r") as handle:
        return set(handle.read().lower().split("\n"))


def _count_first_match(words, vocabularies):
    """Count words per vocabulary, crediting each word to the first one containing it.

    Returns a list with one count per entry of *vocabularies*, in order.
    """
    counts = [0] * len(vocabularies)
    for word in words:
        for index, vocabulary in enumerate(vocabularies):
            if word in vocabulary:
                counts[index] += 1
                break
    return counts


class ExampleApp(tk.Tk):
    """Main window: a column of stage buttons above a log text widget."""

    def __init__(self):
        """Build the toolbar and log widget and redirect stdout/stderr into the log."""
        super().__init__()
        toolbar = tk.Frame(self)
        toolbar.pack(side="top", fill="x")

        stages = (
            ("cleaner", self.cleaner),
            ("print text", self.print_text),
            ("appearances", self.appearances),
            ("content analysis", self.content_analysis),
            ("artist presence", self.presence),
            ("import ready", self.import_ready),
        )
        for label, command in stages:
            button = tk.Button(self, text=label, command=command)
            button.pack(in_=toolbar, side="top", fill="both")

        self.text = tk.Text(self, wrap="word")
        self.text.pack(side="top", fill="both", expand=True)
        self.text.tag_configure("stderr", foreground="#b22222")

        # From here on print() output and tracebacks appear in the window.
        sys.stdout = TextRedirector(self.text, "stdout")
        sys.stderr = TextRedirector(self.text, "stderr")

    def cleaner(self):
        """Turn ``start.txt`` into ``clean.txt`` through ten intermediate files.

        Each step reads the previous step's file and writes the next one
        (``1.txt`` .. ``10.txt``); the intermediates are deleted at the end and
        the number of lines in ``clean.txt`` is printed.
        """
        # Step 1: join all lines of the export into one line.
        with open(_path("start.txt"), "r") as src, open(_path("1.txt"), "w") as dst:
            print(" ".join(line.strip() for line in src), file=dst)

        # Step 2: start a new line, marked "<1>", at every document separator.
        # "DOCUMENTAIRE" is lower-cased first so it is not mistaken for one.
        with open(_path("1.txt"), "r") as src, open(_path("2.txt"), "w") as dst:
            for line in src:
                line = line.replace("DOCUMENTAIRE", "documentaire")
                line = line.replace("DOCUMENTS", "\n<1>")
                line = line.replace("DOCUMENT", "\n<1>")
                print(line, file=dst)

        # Step 3: remove redundant lines.
        # NOTE(review): the prefix literal is empty, so as written every line
        # is kept - restore the intended prefix.
        with open(_path("2.txt"), "r") as src, open(_path("3.txt"), "w") as dst:
            for line in src:
                if line.startswith(""):
                    print(line, end="", file=dst)

        # Step 4: keep only articles that carry a LENGTH field.
        with open(_path("3.txt"), "r") as src, open(_path("4.txt"), "w") as dst:
            for line in src:
                if "LENGTH: " in line:
                    print(line, file=dst)

        # Step 5: keep only articles whose SECTION field names a page.
        with open(_path("4.txt"), "r") as src, open(_path("5.txt"), "w") as dst:
            for line in src:
                section = line[line.find("SECTION: ") + 9:line.find("LENGTH:")]
                if "Blz." in section or "Pg." in section:
                    print(line, end="", file=dst)

        # Step 6: prefix each article with its page number.
        with open(_path("5.txt"), "r") as src, open(_path("6.txt"), "w") as dst:
            for line in src:
                section = line[line.find("SECTION: "):line.find("LENGTH:")]
                section = section[section.find(";"):].replace("Pg.", "Blz.")
                page = section[section.find("Blz."):]
                page = page.replace(",", " ").replace("Blz. ", "")
                digits = page.translate(_PAGE_LETTERS)
                if len(digits.split()) == 1:
                    print("", digits, line, end="", file=dst)
                # NOTE(review): this tests the text *before* the section
                # letters are removed, as the original did; lines that match
                # neither branch are dropped - confirm this is intended.
                elif len(page.split()) == 0:
                    print("0", line, end="", file=dst)

        # Step 7: prefix each article with a newspaper label and blank out the
        # title in the line.
        with open(_path("6.txt"), "r") as src, open(_path("7.txt"), "w") as dst:
            for line in src:
                header = line[line.find("<1>") + 3:line.find("20")]
                # Fall back to an empty label and the untouched line; the
                # original reused the previous article's values (or crashed on
                # the first line) when no title matched.
                label, stripped = "", line
                for title, name in _NEWSPAPERS:
                    if title in header:
                        label, stripped = name, line.replace(title, " ")
                        break
                print("", label, stripped, end="", file=dst)

        # Step 8: isolate the headline part and drop weekday names from it.
        with open(_path("7.txt"), "r") as src, open(_path("8.txt"), "w") as dst:
            for line in src:
                newspaper = line[line.find(""):line.find("") + 7]
                head = line[line.find("<1>"):line.find("SECTION:")].lower()
                head = _replace_each(head, _WEEKDAYS)
                head = head[head.find("") + 5:]
                rest = line[line.find("SECTION:"):]
                print(newspaper, head, rest, end="", file=dst)

        # Step 9: break each article into its fields.
        with open(_path("8.txt"), "r") as src, open(_path("9.txt"), "w") as dst:
            for line in src:
                if "LENGTH:" not in line:
                    continue
                (newspaper, headline, section, length,
                 year, month, pubtype, text) = _record_fields(line)
                # NOTE(review): the literal before `section` was split across
                # two physical lines in the reviewed file - confirm.
                print("", newspaper, "", headline, "\n", section, "", length,
                      "", year, " ", month, "", pubtype, "", text,
                      sep="", end="\n", file=dst)

        # Step 10: remove duplicate headlines.
        # NOTE(review): the end marker was split across two physical lines in
        # the reviewed file - confirm.
        _copy_unique(
            "9.txt", "10.txt",
            lambda line: line[line.find(" ") + 7:line.find("\n")],
        )

        # Step 11: remove duplicate texts.
        _copy_unique("10.txt", "clean.txt", lambda line: line[line.find("") + 6:])

        # Step 12: report the number of remaining articles, then clean up.
        with open(_path("clean.txt"), "r") as src:
            count = sum(1 for _ in src)
        print(count)

        for number in range(1, 11):
            os.remove(_path("{}.txt".format(number)))
        print("done cleaning")

    def print_text(self):
        """Write the text part of every line of ``clean.txt`` to ``text.txt``.

        The output is meant for compiling the word dictionaries.
        """
        with open(_path("clean.txt"), "r") as src, open(_path("text.txt"), "w") as dst:
            for line in src:
                print(line[line.find(""):], end="\n", file=dst)
        print("printed")

    def appearances(self):
        """Prefix each line of ``clean.txt`` with per-artist mention counts.

        Writes ``cleanv1.txt`` and deletes ``clean.txt``.
        """
        names = ("weiwei", "banksy", "haacke", "panahi", "jonas", "pussy riot")
        with open(_path("clean.txt"), "r") as src, open(_path("cleanv1.txt"), "w") as dst:
            for line in src:
                text = line[line.find(""):]
                (countweiwei, countbanksy, counthaacke,
                 countpanahi, countjonas, countpr) = (text.count(name) for name in names)
                print("", countweiwei, "", countbanksy, "", counthaacke,
                      "", countpanahi, "", countjonas, "", countpr, line,
                      sep="", end="", file=dst)
        os.remove(_path("clean.txt"))
        print("appearances done")

    def content_analysis(self):
        """Prefix each line with counts of artistic and political words.

        Words come from ``art.txt`` and ``pol.txt``; a word found in both is
        counted as artistic only.  Writes ``cleanv2.txt`` and deletes
        ``cleanv1.txt``.
        """
        vocabularies = [_read_vocabulary("art.txt"), _read_vocabulary("pol.txt")]
        with open(_path("cleanv1.txt"), "r") as src, open(_path("cleanv2.txt"), "w") as dst:
            for line in src:
                words = line[line.find(""):].split(" ")
                count1, count2 = _count_first_match(words, vocabularies)
                print("", count1, "", count2, line, sep="", end="", file=dst)
        os.remove(_path("cleanv1.txt"))
        print("analysis in cleanv2.txt")

    def presence(self):
        """Prefix each line with counts of words from the six artist dictionaries.

        A word is credited to the first dictionary containing it, in the order
        weiwei, banksy, haacke, panahi, jonas, pr.  Writes ``cleanv3.txt`` and
        deletes ``cleanv2.txt``.
        """
        vocabularies = [
            _read_vocabulary(name)
            for name in ("weiwei.txt", "banksy.txt", "haacke.txt",
                         "panahi.txt", "jonas.txt", "pr.txt")
        ]
        with open(_path("cleanv2.txt"), "r") as src, open(_path("cleanv3.txt"), "w") as dst:
            for line in src:
                words = line[line.find(""):].split(" ")
                (count1, count2, count3,
                 count4, count5, count6) = _count_first_match(words, vocabularies)
                print("", count1, "", count2, "", count3, "", count4,
                      "", count5, "", count6, line, sep="", end="", file=dst)
        os.remove(_path("cleanv2.txt"))
        print("analysis in cleanv3.txt")

    def import_ready(self):
        """Write ``clean_sep.txt``: a header row plus each line with ";" delimiters.

        The result is meant for import into Excel or RStudio.
        NOTE(review): the input is ``cleanv5.txt``, which no method visible
        here produces - presumably created by a step outside this file.
        """
        columns = (
            "sectionsrecoded", "weiweipres", "banksypres", "haackepres",
            "panahipres", "jonaspres", "prpres", "art", "pol", "weiwei",
            "banksy", "haacke", "panahi", "jonas", "pr", "newspaper", "page",
            "headline", "section", "length", "date", "type", "text",
        )
        # One delimiter per column, each replaced by ";".
        # NOTE(review): all but one of these literals are empty, and the 11th
        # was split across two physical lines in the reviewed file; restore
        # the real delimiters.
        delimiters = (
            "", "", "", "", "", "", "", "", "", "",
            "\n",
            "", "", "", "", "", "", "", "", "", "", "", "",
        )
        with open(_path("cleanv5.txt"), "r") as src, open(_path("clean_sep.txt"), "w") as dst:
            print(*columns, sep=";", end="\n", file=dst)
            for line in src:
                line = _replace_each(line, delimiters, ";")
                line = line.replace("", "")
                print(line, end="", file=dst)
        print("import file done")


class TextRedirector(object):
    """File-like object that appends everything written to it to a text widget."""

    def __init__(self, widget, tag="stdout"):
        """Remember the target *widget* and the *tag* applied to inserted text."""
        self.widget = widget
        self.tag = tag

    def write(self, str):
        """Append *str* to the widget, leaving the widget disabled afterwards."""
        self.widget.configure(state="normal")
        self.widget.insert("end", str, (self.tag,))
        self.widget.configure(state="disabled")

    def flush(self):
        """Do nothing; present so ``print(..., flush=True)`` works on this stream."""


# Guarded so that importing the module does not open the window.
if __name__ == "__main__":
    app = ExampleApp()
    app.geometry("630x750")
    app.title("data processor")
    app.mainloop()