import tkinter as tk
import sys
import os
import re
from collections import Counter

########## to consider before running ##########

"""
For cleaner check:
- block 4: JOURNAL-CODE is not always given.
- block 6: change artist
"""


class ExampleApp(tk.Tk):

    def __init__(self):
        """Build the main window.

        A toolbar with one button per processing step sits at the top; a
        word-wrapped text area fills the rest.  ``sys.stdout`` and
        ``sys.stderr`` are replaced by ``TextRedirector`` objects that write
        into that text area; text tagged "stderr" is drawn in ``#b22222``.
        """
        super().__init__()

        toolbar = tk.Frame(self)
        toolbar.pack(side="top", fill="x")

        # (button label, handler) in the order the buttons are stacked.
        actions = (
            ("cleaner", self.cleaner),
            ("print text", self.print_text),
            ("appearances", self.appearances),
            ("content analysis", self.content_analysis),
            ("artist presence", self.presence),
            ("import ready", self.import_ready),
        )
        buttons = [
            tk.Button(self, text=label, command=handler)
            for label, handler in actions
        ]
        for button in buttons:
            button.pack(in_=toolbar, side="top", fill="both")

        self.text = tk.Text(self, wrap="word")
        self.text.pack(side="top", fill="both", expand=True)
        self.text.tag_configure("stderr", foreground="#b22222")

        # From here on everything printed by the handlers lands in the window.
        sys.stdout = TextRedirector(self.text, "stdout")
        sys.stderr = TextRedirector(self.text, "stderr")


    def cleaner(self):
        """Turn the raw export ``start.txt`` into one tagged line per article.

        Steps, in order:

        1. join all lines of ``…/start.txt`` into one string and split it
           into articles at every ``DOCUMENT``/``DOCUMENTS`` marker;
        2. keep only articles that have ``SECTION: `` and ``LENGTH: `` fields
           and a ``Blz.``/``Pg.`` page reference in the section;
        3. tag each article with page number and newspaper, isolate the
           headline and break the article into tagged fields;
        4. drop articles whose headline, or whose body text, was already
           seen (the first occurrence wins);
        5. write the survivors to ``…/clean.txt``, print how many there are
           and print "done cleaning".

        Everything happens in memory, so no numbered scratch files are
        created and nothing is left behind when a step fails.
        """
        with open("…/start.txt", "r") as source:
            joined = " ".join(line.strip() for line in source)

        kept = []
        seen_headlines = set()
        seen_texts = set()
        for article in self._raw_articles(joined):
            paged = self._tag_page(article)
            if paged is None:
                continue
            record = self._to_record(
                self._isolate_headline(self._tag_newspaper(paged)))

            # Compare headlines without surrounding blanks so the same
            # headline is recognised however much padding the header had.
            headline = record[record.find("<head>") + 6:
                              record.find("<section>")].strip()
            if headline in seen_headlines:
                continue
            seen_headlines.add(headline)

            text = record[record.find("<text>") + 6:]
            if text in seen_texts:
                continue
            seen_texts.add(text)
            kept.append(record)

        # Written only after every article was processed, so a failure above
        # never leaves a half-filled clean.txt.
        with open("…/clean.txt", "w") as target:
            for record in kept:
                print(record, file=target)
        print(len(kept))
        print("done cleaning")

    @staticmethod
    def _raw_articles(joined):
        """Return the article chunks of *joined* that carry a page reference."""
        # Lower-case the Dutch word first so it is not taken for a marker.
        marked = joined.replace("DOCUMENTAIRE", "documentaire")
        marked = marked.replace("DOCUMENTS", "\n<start><1>")
        marked = marked.replace("DOCUMENT", "\n<start><1>")

        articles = []
        for chunk in marked.split("\n"):
            if not chunk.startswith("<start>"):
                continue
            # Every later step slices on these two fields; without them
            # find() returns -1 and the slices pick up unrelated text.
            if "SECTION: " not in chunk or "LENGTH: " not in chunk:
                continue
            section = chunk[chunk.find("SECTION: ") + 9:chunk.find("LENGTH:")]
            if "Blz." in section or "Pg." in section:
                articles.append(chunk)
        return articles

    @staticmethod
    def _tag_page(article):
        """Prefix *article* with its page tag.

        Returns ``"<page> N <article>"`` when exactly one page token is
        found, ``"<page>0 <article>"`` when none is found, and ``None``
        otherwise (e.g. several page numbers), which drops the article.
        """
        section = article[article.find("SECTION: "):article.find("LENGTH:")]
        reference = section[section.find(";"):].replace("Pg.", "Blz.")
        reference = reference[reference.find("Blz."):]
        reference = reference.replace(",", " ").replace("Blz. ", "")
        # Drop the capital letters A-G, Q, T and W; presumably section
        # prefixes such as "A12" -- confirm against the export.
        number = reference.translate(str.maketrans("", "", "ACBDETQFGW"))
        if len(number.split()) == 1:
            return "<page> " + number + " " + article
        # NOTE(review): this tests the reference *before* the letters were
        # removed, so a letters-only reference drops the article instead of
        # getting page 0.  Kept as it was; confirm that this is intended.
        if not reference.split():
            return "<page>0 " + article
        return None

    @staticmethod
    def _tag_newspaper(line):
        """Prefix *line* with ``<newspaper> name`` and blank out that name.

        Articles from a source that is not in the table are kept and tagged
        ``unknown``.
        """
        newspapers = (
            ("Algemeen Dagblad", "algemeen dagblad"),
            ("Daily Mirror", "daily mirror"),
            ("Daily Star", "daily star"),
            ("Daily Telegraph", "daily telegraph"),
            ("Evening Standard", "evening standard"),
            ("NRC", "NRC"),
            ("Daily News", "daily news"),
            ("New York Times", "new york times"),
            ("Telegraaf", "telegraaf"),
            ("USA TODAY", "usa today"),
            ("Volkskrant", "volkskrant"),
            ("Washington Post", "washington post"),
        )
        start = line.find("<1>") + 3
        # Search for the "20" of the year only behind the <1> marker: the
        # page tag in front of it may itself contain "20".
        end = line.find("20", start)
        header = line[start:] if end == -1 else line[start:end]
        for name, label in newspapers:
            if name in header:
                return "<newspaper> " + label + " " + line.replace(name, " ")
        return "<newspaper> unknown " + line

    @staticmethod
    def _isolate_headline(line):
        """Reduce the article header to the lower-cased text after the weekday."""
        weekdays = (
            "maandag", "monday", "dinsdag", "tuesday", "woensdag",
            "wednesday", "donderdag", "thursday", "vrijdag", "friday",
            "zaterdag", "saturday", "zondag", "sunday",
        )
        tags = line[line.find("<newspaper>"):line.find("<start>") + 7]
        header = line[line.find("<1>"):line.find("SECTION:")].lower()
        for day in weekdays:
            header = header.replace(day, "<dag>")
        # Whatever follows the first weekday is treated as the headline.
        headline = header[header.find("<dag>") + 5:]
        return tags + " " + headline + " " + line[line.find("SECTION:"):]

    @staticmethod
    def _to_record(line):
        """Break one prepared article line into the tagged output record.

        The record has the form ``<newspaper>..<head>..<section>..<length>..
        <date>YEAR MONTH<type>..<text>..`` without a trailing newline.
        Fields whose marker is missing come out empty.
        """
        newspaper = line[line.find("<newspaper> ") + 12:line.find(" <start>")]

        # Headline: cut the edition note in front and the byline behind.
        headline = line[line.find("<start> ") + 8:line.find(" SECTION:")]
        edition_at = headline.find("national edition")
        byline_at = headline.find("byline:")
        if edition_at != -1:
            end = byline_at if byline_at != -1 else len(headline)
            headline = headline[edition_at + 16:end]
        elif byline_at != -1:
            headline = headline[:byline_at]
        headline = headline.replace("  ", " ").replace(";", "")
        for note in ("edition 1", "sports final edition",
                     "sports final replate edition", "late edition - final",
                     "final edition", "every edition", "regional edition",
                     "suburban edition"):
            headline = headline.replace(note, "")

        # Section name: everything in front of the page reference.
        section = line[line.find("SECTION: ") + 9:].partition(" LENGTH:")[0]
        section = section.lower().replace(";", "").replace("blz.", "pg.")
        section = section.partition("pg.")[0]

        # Word count: the digits behind LENGTH:, whatever word ("woorden",
        # "words") follows them.
        length_match = re.search(r"LENGTH:(\s*\d+)", line)
        length = length_match.group(1) if length_match else ""

        # Body text: skip the two LENGTH tokens (count and unit), then
        # replace punctuation by blanks and lower-case.
        body = line[line.find("LENGTH: ") + 8:].partition("LOAD-DATE")[0]
        body = " ".join(body.split()[2:])
        punctuation = ",.-!?:;_#&'\"()"
        body = body.translate(
            str.maketrans(punctuation, " " * len(punctuation))).lower()

        # Date: the year is the third token behind LOAD-DATE.
        months = (
            ("Januari", "january"), ("January", "january"),
            # NOTE(review): "februari" is the only non-English label; kept
            # because downstream data may already rely on it.
            ("Februari", "februari"), ("February", "februari"),
            ("Maart", "march"), ("March", "march"),
            ("April", "april"),
            ("Mei", "may"), ("May", "may"),
            ("Juni", "june"), ("June", "june"),
            ("Juli", "july"), ("July", "july"),
            ("Augustus", "august"), ("August", "august"),
            ("September", "september"),
            ("Oktober", "october"), ("October", "october"),
            ("November", "november"), ("December", "december"),
        )
        date_part = line.partition("LOAD-DATE: ")[2]
        date_tokens = date_part.split()
        year = date_tokens[2] if len(date_tokens) > 2 else ""
        # Check the three date tokens first so a month name further down the
        # line cannot win; fall back to the whole tail.
        month = ""
        for haystack in (" ".join(date_tokens[:3]), date_part):
            month = next(
                (label for name, label in months if name in haystack), "")
            if month:
                break

        # Publication type ends at JOURNAL-CODE or Copyright, whichever
        # comes first; JOURNAL-CODE is not always present.
        pubtype = ""
        type_at = line.find("PUBLICATION-TYPE: ")
        if type_at != -1:
            pubtype = line[type_at + 18:]
            for stop in (" JOURNAL-CODE:", " Copyright"):
                pubtype = pubtype.partition(stop)[0]
            pubtype = pubtype.lower()

        return "".join((
            "<newspaper>", newspaper, "<head>", headline,
            "<section>", section, "<length>", length,
            "<date>", year, " ", month,
            "<type>", pubtype, "<text>", body,
        ))


    def print_text(self):
        """Copy the text part of every cleaned article to ``…/text.txt``.

        For each line of ``…/clean.txt`` the part from the ``<text>`` tag to
        the end of the line is written, followed by an extra newline, so the
        articles are separated by a blank line.  The output is meant as raw
        material for compiling the word dictionaries.  Prints "printed" when
        done.
        """
        # The with-block closes both files even when reading or writing fails.
        with open("…/clean.txt", "r") as source, open("…/text.txt", "w") as target:
            for line in source:
                print(line[line.find("<text>"):], end="\n", file=target)
        print("printed")


     
    def appearances(self):
        """Prefix every cleaned article with per-artist mention counts.

        Reads ``…/clean.txt`` and writes ``…/cleanv1.txt``.  Each output line
        is the input line preceded by ``<countweiwei>N<countbanksy>N...``,
        where N is how often the artist's name occurs as a substring of the
        part of the line that starts at the ``<text>`` tag.  ``clean.txt`` is
        deleted afterwards and "appearances done" is printed.
        """
        # (output tag, search term) in output order.
        artists = (
            ("<countweiwei>", "weiwei"),
            ("<countbanksy>", "banksy"),
            ("<counthaacke>", "haacke"),
            ("<countpanahi>", "panahi"),
            ("<countjonas>", "jonas"),
            ("<countpr>", "pussy riot"),
        )
        # The with-block closes both files even when a line fails halfway.
        with open("…/clean.txt", "r") as source, open("…/cleanv1.txt", "w") as target:
            for line in source:
                text = line[line.find("<text>"):]
                counts = "".join(
                    tag + str(text.count(term)) for tag, term in artists)
                target.write(counts + line)
        # The input is consumed: later steps work on cleanv1.txt.
        os.remove("…/clean.txt")
        print("appearances done")
        


    def content_analysis(self):
        """Prefix every article with its art and politics word counts.

        Reads ``…/cleanv1.txt`` and writes ``…/cleanv2.txt``.  The two
        dictionaries ``…/art.txt`` and ``…/pol.txt`` hold one lower-cased
        word per line.  Every whitespace-separated word behind the ``<text>``
        tag is looked up; a word found in the art list counts as art only,
        otherwise it counts as politics when it is in that list.  Each output
        line is ``<art>N<pol>M`` followed by the input line.  ``cleanv1.txt``
        is deleted afterwards.

        Dictionary entries that contain a blank can never match, because the
        text is compared word by word.
        """
        def load_words(path):
            # A set gives constant-time lookups.  Blank lines are discarded:
            # an empty entry would otherwise match nothing useful.
            with open(path, "r") as handle:
                words = {
                    entry.strip()
                    for entry in handle.read().lower().split("\n")
                }
            words.discard("")
            return words

        # "…/art.txt" used to be spelled with a blank after the directory,
        # unlike every other path in this file.
        art_words = load_words("…/art.txt")
        pol_words = load_words("…/pol.txt")

        with open("…/cleanv1.txt", "r") as source, open("…/cleanv2.txt", "w") as target:
            for line in source:
                art_count = 0
                pol_count = 0
                # Split the text *behind* the tag on any whitespace, so the
                # first word is not glued to "<text>", the last word is not
                # glued to the newline, and runs of blanks yield no empty
                # words.
                for word in line.partition("<text>")[2].split():
                    if word in art_words:
                        art_count += 1
                    elif word in pol_words:
                        pol_count += 1
                target.write(
                    "<art>" + str(art_count) + "<pol>" + str(pol_count) + line)
        os.remove("…/cleanv1.txt")
        print("analysis in cleanv2.txt")



    def presence(self):
        """Prefix every article with per-artist dictionary hit counts.

        Reads ``…/cleanv2.txt`` and writes ``…/cleanv3.txt``.  Each artist has
        a dictionary file with one lower-cased word per line.  Every
        whitespace-separated word behind the ``<text>`` tag is credited to the
        first artist (in the order weiwei, banksy, haacke, panahi, jonas, pr)
        whose dictionary contains it.  Each output line is
        ``<weiweipresence>N<banksypresence>N...`` followed by the input line.
        ``cleanv2.txt`` is deleted afterwards.

        Dictionary entries that contain a blank can never match, because the
        text is compared word by word.
        """
        # (output tag, dictionary file) in lookup and output order.
        sources = (
            ("<weiweipresence>", "…/weiwei.txt"),
            ("<banksypresence>", "…/banksy.txt"),
            ("<haackepresence>", "…/haacke.txt"),
            ("<panahipresence>", "…/panahi.txt"),
            ("<jonaspresence>", "…/jonas.txt"),
            ("<prpresence>", "…/pr.txt"),
        )
        vocabularies = []
        for tag, path in sources:
            with open(path, "r") as handle:
                words = {
                    entry.strip()
                    for entry in handle.read().lower().split("\n")
                }
            # A blank dictionary line must not turn into a matchable entry.
            words.discard("")
            vocabularies.append((tag, words))

        with open("…/cleanv2.txt", "r") as source, open("…/cleanv3.txt", "w") as target:
            for line in source:
                counts = [0] * len(vocabularies)
                # Split the text *behind* the tag on any whitespace, so the
                # first and last word are not glued to the tag or newline.
                for word in line.partition("<text>")[2].split():
                    for index, (_, words) in enumerate(vocabularies):
                        if word in words:
                            counts[index] += 1
                            break
                prefix = "".join(
                    tag + str(count)
                    for (tag, _), count in zip(vocabularies, counts))
                target.write(prefix + line)
        os.remove("…/cleanv2.txt")
        print("analysis in cleanv3.txt")
        


    def import_ready(self):
        """Write the tagged articles as a ``;``-delimited file.

        Reads ``…/cleanv5.txt`` and writes ``…/clean_sep.txt`` for import into
        Excel or RStudio: first a header row with the column names, then every
        input line with each field tag replaced by ``;``.  The
        ``<sectionrecoded>`` tag is removed without a delimiter, so its value
        becomes the first column.  Prints "import file done" when finished.
        """
        # NOTE(review): no step in this class writes cleanv5.txt; presumably
        # it comes from a manual recoding step that adds <sectionrecoded>.
        columns = (
            "sectionsrecoded", "weiweipres", "banksypres", "haackepres",
            "panahipres", "jonaspres", "prpres", "art", "pol", "weiwei",
            "banksy", "haacke", "panahi", "jonas", "pr", "newspaper", "page",
            "headline", "section", "length", "date", "type", "text",
        )
        delimiter_tags = (
            "<art>", "<pol>", "<countweiwei>", "<countbanksy>",
            "<counthaacke>", "<countpanahi>", "<countjonas>", "<countpr>",
            "<newspaper>", "<head>", "<section>", "<length>", "<date>",
            "<type>", "<text>", "<start>", "<page>", "<weiweipresence>",
            "<banksypresence>", "<haackepresence>", "<panahipresence>",
            "<jonaspresence>", "<prpresence>",
        )
        # The with-block closes both files even when a line fails halfway.
        with open("…/cleanv5.txt", "r") as source, open("…/clean_sep.txt", "w") as target:
            print(*columns, sep=";", end="\n", file=target)
            for line in source:
                for tag in delimiter_tags:
                    line = line.replace(tag, ";")
                target.write(line.replace("<sectionrecoded>", ""))
        print("import file done")

        
class TextRedirector(object):
    """Minimal output stream that appends what is written to a text widget.

    Instances are installed as ``sys.stdout`` / ``sys.stderr``, so they offer
    the two methods stream users rely on: ``write`` and ``flush``.
    """

    def __init__(self, widget, tag="stdout"):
        """Remember the target *widget* and the *tag* put on inserted text."""
        self.widget = widget
        self.tag = tag

    def write(self, str):
        """Append *str* at the end of the widget, tagged with ``self.tag``.

        The widget is switched to "normal" only for the insert and is left
        "disabled" afterwards.
        """
        # NOTE: the parameter shadows the built-in ``str``; the name is kept
        # so callers passing it by keyword keep working.
        self.widget.configure(state="normal")
        self.widget.insert("end", str, (self.tag,))
        self.widget.configure(state="disabled")

    def flush(self):
        """Do nothing: every write goes straight to the widget.

        Needed because ``print(..., flush=True)`` and the interpreter's
        exit-time cleanup call ``flush()`` on the standard streams.
        """

# Create the window, label and size it, then hand control to Tk's event loop.
app = ExampleApp()
app.title("data processor")
app.geometry("630x750")
app.mainloop()