whu-textual-analysis/lectures/programming/solutions/Problem_9_Words_per_Sentence.py

# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016

@author: Alexander Hillert, Goethe University Frankfurt
"""

# We split the text into words and sentences using regular expression
import re
# For comparison, we also include the NLTK tokenizer
from nltk.tokenize import sent_tokenize

# Folder that contains the input list, the cleaned 10-K filings (sub-folder
# "10-K_Sample") and that receives the output file.
directory="C:/Lehre/Textual Analysis/Programming/Files/"

# The patterns are written as raw strings so that "\W", "\." and "\?" reach
# the regex engine unchanged without triggering invalid-escape warnings.
# They are compiled once here instead of once per filing.
# One or more non-word characters separate two words.
WORD_SEPARATOR = re.compile(r'\W{1,}')
# One or more of ".", "!" or "?" mark the end of a sentence.
SENTENCE_END = re.compile(r'[\.!\?]{1,}')
# Alternative 1: the same idea with a non-capturing group.
SENTENCE_END_1 = re.compile(r'(?:\.|!|\?){1,}')
# Alternative 2: the same idea with one repetition per symbol.
SENTENCE_END_2 = re.compile(r'\.{1,}|!{1,}|\?{1,}')
# Incorrect approach (kept on purpose for comparison):
# re.split splits the string by the occurrences of the pattern.
# If capturing parentheses, i.e. (), are used in pattern, then the text
# of all groups in the pattern are also returned as part of the resulting list.
# See https://docs.python.org/3/library/re.html#re.split for details
SENTENCE_END_FALSE = re.compile(r'(\.|!|\?){1,}')


def _split_non_empty(pattern, text):
    """Split *text* by the compiled *pattern* and drop empty list elements.

    Empty strings appear, for example, when the text starts or ends with a
    separator; they must not bias the word or sentence counts.
    """
    return [piece for piece in pattern.split(text) if piece != ""]


def _words_per_sentence(number_of_words, number_of_sentences):
    """Return the ratio of words to sentences.

    A text without any sentence (e.g. an empty file) has no defined ratio;
    NaN is returned in that case so that one such filing does not abort the
    whole run with a ZeroDivisionError.
    """
    if number_of_sentences == 0:
        return float("nan")
    return number_of_words / number_of_sentences


# Open the csv file containing the list of the 10-Ks and read it completely.
# The with-statement closes the file even if reading fails.
with open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8") as input_file:
    input_text=input_file.read()

# Split the input file in separate lines. In general, there can be empty
# lines in the input file; they are dropped here.
input_text_line=[line for line in input_text.split("\n") if line != ""]

# Create output file; it is closed automatically, also in case of an error.
with open(directory+'10-K_Sample_2011Q1_Output_WPS.csv','w',encoding="utf-8") as output_file:
    # Write variable names to the first line of the output file
    output_file.write('CIK;Filename;Number_Words;Number_of_Sentences;'\
    'Number_of_Sentences_1;Number_of_Sentences_2;Number_of_Sentences_false;'\
    'Number_of_Sentences_NLTK;WPS;WPS_1;WPS_2;WPS_false;WPS_NLTK\n')

    # Loop over all lines except the first one (index 0), which is skipped.
    for i, line in enumerate(input_text_line[1:], start=1):
        print(str(i))
        # split the line into the two variables
        variables=line.split(";")
        # We need the CIK and the filename
        cik=variables[0]
        filename=variables[1].replace('.txt','')

        # Open the ith 10-K in the list. Characters that are not ASCII are
        # ignored while reading (errors='ignore').
        with open(directory+'10-K_Sample/'+cik+"_"+filename+'_clean.txt','r',
                  encoding='ascii',errors='ignore') as input_file_10_k:
            text=input_file_10_k.read()

        # Determine total number of words
        word_count=len(_split_non_empty(WORD_SEPARATOR, text))

        # Determine total number of sentences with each of the approaches
        sentence_count=len(_split_non_empty(SENTENCE_END, text))
        sentence_count_1=len(_split_non_empty(SENTENCE_END_1, text))
        sentence_count_2=len(_split_non_empty(SENTENCE_END_2, text))
        sentence_count_false=len(_split_non_empty(SENTENCE_END_FALSE, text))
        # For comparison, we also include the NLTK tokenizer
        sentence_count_nltk=len(sent_tokenize(text))

        # Ratio of # of words over # of sentences
        wps=_words_per_sentence(word_count, sentence_count)
        wps_1=_words_per_sentence(word_count, sentence_count_1)
        wps_2=_words_per_sentence(word_count, sentence_count_2)
        wps_false=_words_per_sentence(word_count, sentence_count_false)
        wps_nltk=_words_per_sentence(word_count, sentence_count_nltk)

        # Write cik, file name, total number of words, total number of
        # sentences, and WPS to the output file (semicolon-separated, in the
        # same order as the variable names in the first line).
        output_fields=[cik, filename+'_clean.txt', str(word_count),
                       str(sentence_count), str(sentence_count_1),
                       str(sentence_count_2), str(sentence_count_false),
                       str(sentence_count_nltk), str(wps), str(wps_1),
                       str(wps_2), str(wps_false), str(wps_nltk)]
        output_file.write(';'.join(output_fields)+'\n')

print("Finished")
Add programming files - add the code files provided by the instructor - the programming/files folder with the data files is NOT included here due to its size - add a .gitignore file to exclude the data files' folder 2022-08-05 00:05:05 +02:00			`# -*- coding: utf-8 -*-`
			`"""`
			`Created on Wed Apr 13 22:43:32 2016`

			`@author: Alexander Hillert, Goethe University Frankfurt`
			`"""`

			`# We split the text into words and sentences using regular expression`
			`import re`
			`# For comparison, we also include the NLTK tokenizer`
			`from nltk.tokenize import sent_tokenize`

			`directory="C:/Lehre/Textual Analysis/Programming/Files/"`

			`# Open the csv file containing the list of the 200 10-Ks`
			`input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")`
			`input_text=input_file.read()`

			`# Create output file`
			`output_file=open(directory+'10-K_Sample_2011Q1_Output_WPS.csv','w',encoding="utf-8")`
			`# Write variable names to the first line of the output file`
			`output_file.write('CIK;Filename;Number_Words;Number_of_Sentences;'\`
			`'Number_of_Sentences_1;Number_of_Sentences_2;Number_of_Sentences_false;'\`
			`'Number_of_Sentences_NLTK;WPS;WPS_1;WPS_2;WPS_false;WPS_NLTK\n')`

			`# Split the Input File in separate lines`
			`input_text_line=input_text.split("\n")`

			`# In general, there can be empty lines in the input file. The following command`
			`# deletes these lines.`
			`while input_text_line.count("")>0:`
			`input_text_line.remove("")`

			`# Loop over all lines`
			`for i in range(1,len(input_text_line)):`
			`print(str(i))`
			`# split the line into the two variables`
			`variables=input_text_line[i].split(";")`
			`# We need the CIK and the filename`
			`cik=variables[0]`
			`filename=variables[1]`
			`filename=filename.replace('.txt','')`

			`# Open the ith 10-K in the list`
			`input_file_10_k=open(directory+'10-K_Sample/'+cik+"_"+filename+'_clean.txt','r',\`
			`encoding='ascii',errors='ignore')`
			`text=input_file_10_k.read()`

			`# Determine number of sentences and number of words`
			`# Split the text in words to determine the total number of words`
			`list_of_words=re.split('\W{1,}', text)`
			`# to make sure that empty list elements do not bias the word count, we delete them.`
			`while list_of_words.count("")>0:`
			`list_of_words.remove("")`
			`# Determine total number of words`
			`word_count=len(list_of_words)`


			`# Split the text by symbols that indicate the end of a sentence`
			`# to determine the total number of sentences`
			`list_of_sentences=re.split('[\.!\?]{1,}', text)`
			`while list_of_sentences.count("")>0:`
			`list_of_sentences.remove("")`
			`# Alternative 1:`
			`list_of_sentences_1=re.split('(?:\.|!|\?){1,}', text)`
			`while list_of_sentences_1.count("")>0:`
			`list_of_sentences_1.remove("")`
			`# Alternative 2:`
			`list_of_sentences_2=re.split('\.{1,}|!{1,}|\?{1,}', text)`
			`while list_of_sentences_2.count("")>0:`
			`list_of_sentences_2.remove("")`
			`# Incorrect approach:`
			`# re.split splits the string by the occurrences of the pattern.`
			`# If capturing parentheses, i.e. (), are used in pattern, then the text`
			`# of all groups in the pattern are also returned as part of the resulting list.`
			`# See https://docs.python.org/3/library/re.html#re.split for details`
			`list_of_sentences_false=re.split('(\.|!|\?){1,}', text)`
			`while list_of_sentences_false.count("")>0:`
			`list_of_sentences_false.remove("")`

			`# For comparison, we also include the NLTK tokenizer`
			`list_of_sentences_nltk=sent_tokenize(text)`

			`# Determine total number of sentences`
			`sentence_count=len(list_of_sentences)`
			`sentence_count_1=len(list_of_sentences_1)`
			`sentence_count_2=len(list_of_sentences_2)`
			`sentence_count_false=len(list_of_sentences_false)`
			`sentence_count_nltk=len(list_of_sentences_nltk)`

			`# Ratio of # of words over # of sentences`
			`wps=word_count/sentence_count`
			`wps_1=word_count/sentence_count_1`
			`wps_2=word_count/sentence_count_2`
			`wps_false=word_count/sentence_count_false`
			`wps_nltk=word_count/sentence_count_nltk`

			`# Write cik, file name, total number of words, total number of sentences,`
			`# and WPS to the output file`
			`output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'+\`
			`str(sentence_count)+';'+str(sentence_count_1)+';'+str(sentence_count_2)+';'+\`
			`str(sentence_count_false)+';'+str(sentence_count_nltk)+';'+str(wps)+';'+\`
			`str(wps_1)+';'+str(wps_2)+';'+str(wps_false)+';'+str(wps_nltk)+'\n')`

			`# Close filing`
			`input_file_10_k.close()`


			`print("Finished")`
			`output_file.close()`
			`input_file.close()`