TFIDF-checkpoint.ipynb 6.58 KB

Dataset and Imports

import pandas as pd
 
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
 
# This is a very toy example, do not try this at home unless you want to
# understand the usage differences.
# Each entry is a single-word "document": with one word per doc, the IDF of a
# term is driven purely by how many list entries contain it, which makes the
# CountVectorizer-vs-TfidfVectorizer comparison below easy to reason about.
# Fixes vs. the original list:
#  - "january," "la" was missing a comma, so Python's implicit string
#    concatenation silently produced the single element "january,la";
#  - "not" appeared twice, which doubled its document frequency by accident.
docs=["the", "to", "ect", "and", "for", "of", "a", "you", "hou", "in", "on", "is", "this", "enron", "i", "be", "that", "will",
      "have", "with", "your","at", "we", "are", "it", "by", "com", "as", "from", "gas", "or","not", "me", "deal", "if",
      "meter","hpl", "please","re", "e", "any", "our", "corp","can", "d", "all", "has", "was", "know", "need", "an", "forwarded",
      "new", "t", "may", "up", "j","should", "do", "am", "out", "see", "no", "there", "price", "daren", "but", "been", "company",
      "I", "these", "let", "so", "would", "m", "into", "xls", "farmer", "attached", "us", "information", "they", "message",
      "day", "time", "my", "one", "what", "only", "http", "th", "volume", "mail", "contract", "which", "month",
      "more", "robert", "sitara", "obout", "texas", "nom", "energy", "pec", "questions", "www", "deals", "volumes", "pm", "ena",
      "now", "their", "file", "some", "email", "just", "also", "call", "change", "other", "here", "like", "b", "flow", "net",
      "following", "p", "production","when", "over", "back", "want", "original", "them", "below", "o", "ticket", "c", "he",
      "could", "make", "inc", "report", "march", "contact", "were", "days", "list", "nomination", "system", "who", "april",
      "number", "sale", "don", "its", "first", "thanks", "business","help", "per", "through", "july", "forward", "font", "free",
      "daily", "use", "order", "today", "r", "had", "fw", "set", "plant", "statements", "go", "gary", "oil", "line", "sales",
      "w", "effective", "well", "tenaska", "take","june","x", "within","nbsp", "she", "how", "north", "america", "being",
      "under", "next", "week", "than", "january", "la"
     ]

Initialize CountVectorizer

# Build a CountVectorizer with its default tokenizer/settings.
cv = CountVectorizer()

# Learn the vocabulary from `docs` and produce the sparse document-term
# count matrix in a single fit_transform pass.
word_count_vector = cv.fit_transform(docs)

# Show the matrix dimensions: (n_documents, n_unique_terms).
word_count_vector.shape

Compute the IDF values

# Fit the TF-IDF transformer on the raw counts. smooth_idf=True adds 1 to
# document frequencies, preventing zero divisions for unseen terms (both
# flags shown here are already the defaults).
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

# Tabulate the learned IDF weight for every vocabulary term.
# FIX: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() (available since 1.0) is the replacement.
df_idf = pd.DataFrame(
    tfidf_transformer.idf_,
    index=cv.get_feature_names_out(),
    columns=["idf_weights"],
)

# Sort ascending: terms occurring in the most documents get the lowest IDF.
df_idf.sort_values(by=["idf_weights"])

Compute the TFIDF score for your documents

# Re-use the already-fitted vocabulary to build the count matrix for `docs`.
count_vector = cv.transform(docs)

# Apply the fitted IDF weights to turn raw counts into TF-IDF scores.
tf_idf_vector = tfidf_transformer.transform(count_vector)
# FIX: get_feature_names() was removed in scikit-learn 1.2;
# use get_feature_names_out() instead.
feature_names = cv.get_feature_names_out()

# TF-IDF vector for the first document (a 1 x n_terms sparse row).
first_document_vector = tf_idf_vector[0]

# Densify the row and display per-term scores, highest first.
df = pd.DataFrame(
    first_document_vector.T.todense(),
    index=feature_names,
    columns=["tfidf"],
)
df.sort_values(by=["tfidf"], ascending=False)

TfidfVectorizer Usage

from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer = CountVectorizer + TfidfTransformer in one estimator;
# any settings you would give CountVectorizer go here instead.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)

# Fit the vocabulary/IDF and transform all docs in a single call.
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(docs)
# TF-IDF row vector for the first document.
first_vector_tfidfvectorizer = tfidf_vectorizer_vectors[0]

# Place the tf-idf values in a pandas data frame, highest score first.
# FIX: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the supported replacement.
df = pd.DataFrame(
    first_vector_tfidfvectorizer.T.todense(),
    index=tfidf_vectorizer.get_feature_names_out(),
    columns=["tfidf"],
)
df.sort_values(by=["tfidf"], ascending=False)

# Equivalent two-step usage: fit() returns the fitted vectorizer itself,
# so transform() can then be called separately (e.g. on new documents).
tfidf_vectorizer = TfidfVectorizer(use_idf=True)

fitted_vectorizer = tfidf_vectorizer.fit(docs)
tfidf_vectorizer_vectors = fitted_vectorizer.transform(docs)