Add programming files
- Add the code files provided by the instructor. The programming/files folder with the data files is NOT included here due to its size; a .gitignore file excludes the data files' folder.
This commit is contained in:
parent
65aae9d4f9
commit
a37c87d9c8
38 changed files with 6416 additions and 0 deletions
111
lectures/programming/solutions/Problem_9_Words_per_Sentence.py
Normal file
111
lectures/programming/solutions/Problem_9_Words_per_Sentence.py
Normal file
|
|
@ -0,0 +1,111 @@
|
|||
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016

@author: Alexander Hillert, Goethe University Frankfurt

Compute words-per-sentence (WPS) statistics for a sample of 10-K filings.
For each filing listed in the input CSV, the script counts words and
sentences (using several regex variants plus the NLTK tokenizer) and
writes one CSV line of counts and WPS ratios to the output file.
"""

# We split the text into words and sentences using regular expressions.
import re
# For comparison, we also include the NLTK sentence tokenizer.
from nltk.tokenize import sent_tokenize

directory = "C:/Lehre/Textual Analysis/Programming/Files/"

# Open the csv file containing the list of the 200 10-Ks.
# A 'with' block guarantees the handle is closed even on error.
with open(directory + '10-K_Sample_2011Q1_Input.csv', 'r', encoding="utf-8") as input_file:
    input_text = input_file.read()

# Create the output file and write the variable names to its first line.
output_file = open(directory + '10-K_Sample_2011Q1_Output_WPS.csv', 'w', encoding="utf-8")
output_file.write('CIK;Filename;Number_Words;Number_of_Sentences;'
                  'Number_of_Sentences_1;Number_of_Sentences_2;Number_of_Sentences_false;'
                  'Number_of_Sentences_NLTK;WPS;WPS_1;WPS_2;WPS_false;WPS_NLTK\n')

# Split the input file into separate lines and drop empty lines.
# (A comprehension is O(n); the repeated list.remove("") idiom is O(n^2).)
input_text_line = [line for line in input_text.split("\n") if line != ""]

# Loop over all lines; line 0 holds the variable names, so start at 1.
for i in range(1, len(input_text_line)):
    print(str(i))
    # Split the line into its two variables: we need the CIK and the filename.
    variables = input_text_line[i].split(";")
    cik = variables[0]
    filename = variables[1].replace('.txt', '')

    # Open the i-th 10-K in the list; 'errors=ignore' drops non-ASCII bytes.
    with open(directory + '10-K_Sample/' + cik + "_" + filename + '_clean.txt', 'r',
              encoding='ascii', errors='ignore') as input_file_10_k:
        text = input_file_10_k.read()

    # Determine the number of words: split on runs of non-word characters.
    # Raw strings (r'...') are required for regex patterns; '\W' in a plain
    # string is an invalid escape sequence. Empty elements are filtered out
    # so they do not bias the word count.
    list_of_words = [w for w in re.split(r'\W+', text) if w != ""]
    word_count = len(list_of_words)

    # Split the text by symbols that indicate the end of a sentence
    # to determine the total number of sentences.
    list_of_sentences = [s for s in re.split(r'[.!?]+', text) if s != ""]
    # Alternative 1: non-capturing group — equivalent to the character class.
    list_of_sentences_1 = [s for s in re.split(r'(?:\.|!|\?)+', text) if s != ""]
    # Alternative 2: alternation of repeated symbols — also equivalent.
    list_of_sentences_2 = [s for s in re.split(r'\.+|!+|\?+', text) if s != ""]
    # Incorrect approach: with CAPTURING parentheses, re.split also returns
    # the captured delimiters as list elements, inflating the sentence count.
    # See https://docs.python.org/3/library/re.html#re.split for details.
    list_of_sentences_false = [s for s in re.split(r'(\.|!|\?){1,}', text) if s != ""]

    # For comparison, we also include the NLTK tokenizer.
    list_of_sentences_nltk = sent_tokenize(text)

    # Determine the total number of sentences under each approach.
    sentence_count = len(list_of_sentences)
    sentence_count_1 = len(list_of_sentences_1)
    sentence_count_2 = len(list_of_sentences_2)
    sentence_count_false = len(list_of_sentences_false)
    sentence_count_nltk = len(list_of_sentences_nltk)

    # Ratio of # of words over # of sentences.
    # NOTE(review): a filing with no sentence-ending punctuation would raise
    # ZeroDivisionError here, matching the original behavior.
    wps = word_count / sentence_count
    wps_1 = word_count / sentence_count_1
    wps_2 = word_count / sentence_count_2
    wps_false = word_count / sentence_count_false
    wps_nltk = word_count / sentence_count_nltk

    # Write cik, file name, total number of words, total number of sentences,
    # and WPS to the output file.
    output_file.write(cik + ';' + filename + '_clean.txt;' + str(word_count) + ';' +
                      str(sentence_count) + ';' + str(sentence_count_1) + ';' + str(sentence_count_2) + ';' +
                      str(sentence_count_false) + ';' + str(sentence_count_nltk) + ';' + str(wps) + ';' +
                      str(wps_1) + ';' + str(wps_2) + ';' + str(wps_false) + ';' + str(wps_nltk) + '\n')

print("Finished")
output_file.close()
|
||||
Loading…
Add table
Add a link
Reference in a new issue