Next, we'll break the big block of text into smaller pieces: first into sentences, and then into individual words. This process is called tokenization. After that, we'll count how many times each word appears.
# We need a special toolkit for this part. NLTK is a famous library for working with human language.
import nltk
# From the toolkit, we grab the tool that splits text into sentences.
from nltk import sent_tokenize
# We also grab the tool that splits text into words.
from nltk import word_tokenize
# This downloads a collection of helpful data and models that NLTK needs to work properly.
nltk.download("popular")
# This downloads the 'punkt_tab' data, which the Punkt tokenizer needs to split text into sentences and words.
nltk.download('punkt_tab')
# We use the sentence tokenizer to split our 'text' into a list of sentences.
sentences = sent_tokenize(text)
# Let's see how many sentences we got.
print(f"Number of sentences: {len(sentences)}")
# Now, we use the word tokenizer to split our 'text' into a list of words.
words = word_tokenize(text)
# Let's see how many words we have in total.
print(f"Number of words: {len(words)}")
# To count the words, we'll use a special tool called FreqDist (Frequency Distribution).
from nltk.probability import FreqDist
# This creates a frequency counter from our list of words.
fdist = FreqDist(words)
# Let's see the 10 most common words in our text.
print("10 Most Common Words:")
print(fdist.most_common(10))
# We'll use this library to draw a graph (a plot).
import matplotlib.pyplot as plt
# This command draws a frequency plot of our 10 most common words.
fdist.plot(10, title="Frequency Distribution of Top 10 Words (with punctuation)")
# This shows the plot we just made.
plt.show()
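Because FreqDist works like a dictionary of counts, you can also look up how often any single token appears. Here 'the' is just an example token; any string works, and a token that never occurs simply returns 0.
# FreqDist supports direct lookups, like a dictionary of counts.
print(fdist["the"])   # how many times the token 'the' appears (0 if it never occurs)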
Our word list is a bit messy: it includes punctuation (like '.', ',') and common words (like 'the', 'a', 'is') that don't tell us much. This step is like washing our data to clean it up.
# --- Removing Punctuation ---
# We'll create a new, empty list to hold our words without any punctuation.
words_no_punc = []
# We'll look at each word in our original 'words' list, one by one.
for w in words:
    # The isalpha() function checks if the word contains ONLY letters.
    if w.isalpha():
        # If it's a real word, we convert it to lowercase and add it to our new list.
        words_no_punc.append(w.lower())
# Let's see how many words are left after we removed the punctuation.
print(f"Number of words after removing punctuation: {len(words_no_punc)}")
# Let's count the frequencies again with our cleaner list.
fdist_no_punc = FreqDist(words_no_punc)
print("\\\\n10 Most Common Words (no punctuation):")
print(fdist_no_punc.most_common(10))
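One thing to keep in mind: isalpha() is a blunt filter. It keeps a token only if every character is a letter, so it also drops tokens containing digits or apostrophes, not just punctuation marks. Here is a minimal illustration with made-up tokens (not from our text):
# str.isalpha() returns True only when the whole string is alphabetic.
print("whale".isalpha())   # True  -> this token would be kept
print("n't".isalpha())     # False -> dropped (the apostrophe is not a letter)
print("1851".isalpha())    # False -> dropped (digits are not letters)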
# --- Removing Stopwords ---
# Stopwords are common words that are usually ignored in text analysis. Let's get a standard list of them.
from nltk.corpus import stopwords
# We load the list of English stopwords (like 'the', 'in', 'a', 'is').
stop_words = stopwords.words("english")
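Before filtering, it can be helpful to glance at what this list actually contains. This quick peek is optional; as a design note, converting the list to a set with set(stop_words) would also make the membership checks below faster, though at this scale it makes no practical difference.
# Peek at a few stopwords and see how many there are (optional).
print(stop_words[:10])
print(f"Total stopwords: {len(stop_words)}")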
# Let's create one more list, this one for our super-clean words.
clean_words = []
# We'll look at each word in our 'no punctuation' list.
for w in words_no_punc:
    # We check if the word is NOT in our list of stopwords.
    if w not in stop_words:
        # If it's not a stopword, we add it to our final 'clean_words' list.
        clean_words.append(w)
# Let's see the final count of meaningful words.
print(f"\\\\nNumber of words after removing stopwords: {len(clean_words)}")
# Let's do our frequency count one last time on the cleanest data.
fdist_clean = FreqDist(clean_words)
print("\\\\n10 Most Common Words (clean):")
print(fdist_clean.most_common(10))
# Finally, let's plot our new, more meaningful results.
fdist_clean.plot(10, title="Frequency Distribution of Top 10 Clean Words")
plt.show()
This block visualizes the most frequent words from the text in the form of a word cloud. It shows both a standard rectangular cloud and one shaped by a custom mask.
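The word-cloud code itself isn't shown in this section, but a minimal sketch using the third-party wordcloud package could look like the following. The mask file name 'mask.png' is a placeholder for whatever black-and-white shape image you choose, and the sizing options are just reasonable defaults rather than the only way to do it.
# We use the 'wordcloud' package (install it with: pip install wordcloud).
from wordcloud import WordCloud
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
# --- Standard rectangular word cloud ---
# Build the cloud straight from our cleaned frequency counts.
wc = WordCloud(width=800, height=400, background_color="white")
wc.generate_from_frequencies(dict(fdist_clean))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()
# --- Word cloud shaped by a custom mask ---
# 'mask.png' is a placeholder path: white areas stay empty, darker areas get filled with words.
mask = np.array(Image.open("mask.png"))
wc_masked = WordCloud(background_color="white", mask=mask)
wc_masked.generate_from_frequencies(dict(fdist_clean))
plt.imshow(wc_masked, interpolation="bilinear")
plt.axis("off")
plt.show()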