# whu-textual-analysis/lectures/programming/introductions/NLTK_Sentence_Tokenizer.py

# -*- coding: utf-8 -*-
"""
Created on Sat Jul 17 17:09:50 2021

@author: ahillert
"""

from nltk.tokenize import sent_tokenize

# Example 1: split a short financial news text, in its original casing, into
# sentences. The text contains many periods that do not end a sentence
# (decimal points such as "43.44" or "0.6"), which makes it a useful test.
# NOTE(review): sent_tokenize presumably needs the NLTK Punkt model data to be
# installed beforehand -- confirm against the course setup instructions.
print("\nExample 1\n")
text_1 = (
    "The S&P 500 rose 43.44 points to 4,159.12. The Dow Jones industrial average "
    "added 188.11 points, or 0.6 percent, to 34,084.15. The tech-heavy Nasdaq fared "
    "better than the rest of the market, climbing 236 points, or 1.8 percent, to 13,535.74"
)

# The tokenizer hands back the detected sentences as a list of strings.
sentence_list_1 = sent_tokenize(text_1)

# enumerate(..., start=1) supplies the 1-based sentence number directly, so no
# index arithmetic or list indexing is needed.
for number, sentence in enumerate(sentence_list_1, start=1):
    print("This is sentence {}:\n{}".format(number, sentence))

# -> good performance

# Example 2: run the tokenizer on the text from Example 1 after converting it
# to lower case, to see how the loss of capitalization affects the result.
print("\nExample 2\n")
text_2 = text_1.lower()

sentence_list_2 = sent_tokenize(text_2)

# enumerate(..., start=1) supplies the 1-based sentence number directly.
for number, sentence in enumerate(sentence_list_2, start=1):
    print("This is sentence {}:\n{}".format(number, sentence))

# -> poor performance
# For the NLTK tokenizer it makes a difference whether text is lower or upper case.


# Example 3: a text packed with abbreviations and initials ("Sept.", "U.S.",
# "John D. Smith", "F. B. I."), i.e. periods that do not mark a sentence end.
print("\nExample 3\n")
text_3 = (
    "On Sept. 16, 2020, the U.S. president appointed John D. Smith as head of the F. B. I. "
    "While Jane C. Taylor became the president of the S. E. C. "
    "On Jan. 5, 2020, J. C. Penny filed for bankruptcy. Michael T. Brown - reporting from Washington D.C."
)

sentence_list_3 = sent_tokenize(text_3)

# enumerate(..., start=1) supplies the 1-based sentence number directly.
for number, sentence in enumerate(sentence_list_3, start=1):
    print("This is sentence {}:\n{}".format(number, sentence))

# -> good performance

# Example 4: the abbreviation-heavy text from Example 3, converted to lower
# case before tokenizing.
# NOTE(review): the script records no verdict for this example; compare the
# printed sentences with those of Example 3 when running it.
print("\nExample 4\n")
text_4 = text_3.lower()

sentence_list_4 = sent_tokenize(text_4)

# enumerate(..., start=1) supplies the 1-based sentence number directly.
for number, sentence in enumerate(sentence_list_4, start=1):
    print("This is sentence {}:\n{}".format(number, sentence))
Add programming files - add the code files provided by the instructor - the programming/files folder with the data files is NOT included here due to its size - add a .gitignore file to exclude the data files' folder 2022-08-05 00:05:05 +02:00			`# -- coding: utf-8 --`
			`"""`
			`Created on Sat Jul 17 17:09:50 2021`

			`@author: ahillert`
			`"""`

			`from nltk.tokenize import sent_tokenize`

			`print("\nExample 1\n")`
			`text_1="The S&P 500 rose 43.44 points to 4,159.12. The Dow Jones industrial average " \`
			`+"added 188.11 points, or 0.6 percent, to 34,084.15. The tech-heavy Nasdaq fared " \`
			`+"better than the rest of the market, climbing 236 points, or 1.8 percent, to 13,535.74"`

			`sentence_list_1=sent_tokenize(text_1)`

			`for i in range(0,len(sentence_list_1)):`
			`print("This is sentence "+str(i+1)+":\n"+sentence_list_1[i])`

			`# -> good performance`

			`print("\nExample 2\n")`
			`text_2=text_1.lower()`

			`sentence_list_2=sent_tokenize(text_2)`

			`for i in range(0,len(sentence_list_2)):`
			`print("This is sentence "+str(i+1)+":\n"+sentence_list_2[i])`

			`# -> poor performance`
			`# For the NLTK tokenizer it makes a difference whether text is lower or upper case.`


			`print("\nExample 3\n")`
			`text_3="On Sept. 16, 2020, the U.S. president appointed John D. Smith as head of the F. B. I. " \`
			`+"While Jane C. Taylor became the president of the S. E. C. " \`
			`+"On Jan. 5, 2020, J. C. Penny filed for bankruptcy. Michael T. Brown - reporting from Washington D.C."`

			`sentence_list_3=sent_tokenize(text_3)`

			`for i in range(0,len(sentence_list_3)):`
			`print("This is sentence "+str(i+1)+":\n"+sentence_list_3[i])`

			`# -> good performance`

			`print("\nExample 4\n")`
			`text_4=text_3.lower()`

			`sentence_list_4=sent_tokenize(text_4)`

			`for i in range(0,len(sentence_list_4)):`
			`print("This is sentence "+str(i+1)+":\n"+sentence_list_4[i])`