Text Classification in Python
Created for the Text Analysis Workshop at the Transdisciplinary Institute in Applied Data Sciences (Washington University in St. Louis)
By Ishita Gopal
Acknowledgement: This notebook is a Python adaptation of Burt Monroe's R notebook from his Text as Data course, which I had the opportunity to take as a PhD student at PSU :)
The task:¶
This notebook is a comprehensive guide to text classification with popular machine learning techniques. It provides a deep dive into the implementation and analysis of Naive Bayes, Support Vector Machines (SVM), and Random Forests, also covering evaluation, interpretation, and comparison of these models.
We will work through a sentiment classification problem, using a movie reviews data set where we know the "negative" and "positive" labels for each of the movie reviews. We'll set some of these reviews aside for a test set and train our models on the remainder as a training set, using unigram presence as the features. Then we'll evaluate the predictions quantitatively as well as look at some ways to interpret what the models tell us.
We are using the Pang and Lee dataset of 2,000 movie reviews and the scikit-learn library.
This tutorial provides details on the models and is aimed at Python users. I walk through all the steps of converting text into a vectorized format that can be fed into classification models, summarize how each model works, show how to extract feature importance from the models, and visualize the results.
Dataset¶
import pandas as pd
reviews = pd.read_csv("https://raw.githubusercontent.com/IshitaGopal/TRIADS_workshops/main/Introduction_to_TextAnalysis/data/movie_reviews_pang02.csv")
reviews.shape
(2000, 2)
reviews.head(2)
| | text | class |
|---|---|---|
| 0 | plot : two teen couples go to a church party ,... | neg |
| 1 | the happy bastard's quick movie review \ndamn ... | neg |
reviews.tail(2)
| | text | class |
|---|---|---|
| 1998 | steven spielberg's second epic film on world w... | pos |
| 1999 | truman ( " true-man " ) burbank is the perfect... | pos |
# Let's look at an example review
reviews["text"][1]
'the happy bastard\'s quick movie review \ndamn that y2k bug . \nit\'s got a head start in this movie starring jamie lee curtis and another baldwin brother ( william this time ) in a story regarding a crew of a tugboat that comes across a deserted russian tech ship that has a strangeness to it when they kick the power back on . \nlittle do they know the power within . . . \ngoing for the gore and bringing on a few action sequences here and there , virus still feels very empty , like a movie going for all flash and no substance . \nwe don\'t know why the crew was really out in the middle of nowhere , we don\'t know the origin of what took over the ship ( just that a big pink flashy thing hit the mir ) , and , of course , we don\'t know why donald sutherland is stumbling around drunkenly throughout . \nhere , it\'s just " hey , let\'s chase these people around with some robots " . \nthe acting is below average , even from the likes of curtis . \nyou\'re more likely to get a kick out of her work in halloween h20 . \nsutherland is wasted and baldwin , well , he\'s acting like a baldwin , of course . \nthe real star here are stan winston\'s robot design , some schnazzy cgi , and the occasional good gore shot , like picking into someone\'s brain . \nso , if robots and body parts really turn you on , here\'s your movie . \notherwise , it\'s pretty much a sunken ship of a movie . '
# The positive and negative classes are balanced
reviews["class"].value_counts()
class
neg    1000
pos    1000
Name: count, dtype: int64
from sklearn.preprocessing import LabelEncoder
# Convert the class variable into an integer representing Pos and Neg
label_encoder = LabelEncoder()
reviews['class_int'] = label_encoder.fit_transform(reviews['class'])
reviews.head(2)
| | text | class | class_int |
|---|---|---|---|
| 0 | plot : two teen couples go to a church party ,... | neg | 0 |
| 1 | the happy bastard's quick movie review \ndamn ... | neg | 0 |
reviews['class_int'].value_counts()
class_int
0    1000
1    1000
Name: count, dtype: int64
Shuffle the rows to randomize the order. We don't want to train only on reviews labeled as positive (or negative); we want both categories represented in the training data. The train_test_split function shuffles the rows for us while splitting the data. We keep 80% of the labeled reviews for training our classification models and 20% for testing their performance.
from sklearn.model_selection import train_test_split
# Split the dataset into training and testing sets
train_data, test_data, y_train, y_test = train_test_split(
reviews['text'], reviews['class_int'], test_size=0.2, random_state=42
)
print("Number of training samples:", train_data.shape)
print("Number of test samples:", test_data.shape)
Number of training samples: (1600,)
Number of test samples: (400,)
# Confirm that the order of the rows was shuffled
train_data.head(10)
968     while watching loser , it occurred to me that ...
240     georges polti once wrote a paper called " the ...
819     sylvester stallone has made some crap films in...
692     attention moviegoers : you are about to enter ...
420     plot : something about a bunch of kids going i...
1085     * * * * * * minor plot spoilers in review * *...
1998    steven spielberg's second epic film on world w...
365     welcome to your oh-so typical sequel . \nit tr...
1022    a fully loaded entertainment review - website ...
1240    seen september 13 , 1998 at 4 p . m at rotterd...
Name: text, dtype: object
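Note that shuffling randomizes the order but does not strictly guarantee identical class proportions in the two splits. If you want that guarantee, train_test_split accepts a stratify argument; here is a minimal variant of the call above (not used in the rest of the notebook):
# Variant of the split above that preserves the exact pos/neg ratio in both sets
train_data_s, test_data_s, y_train_s, y_test_s = train_test_split(
    reviews['text'], reviews['class_int'],
    test_size=0.2, random_state=42, stratify=reviews['class_int']
)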
Create the document-term matrix¶
You can create different types of features for text classification using scikit-learn:
- To create word presence features, use CountVectorizer with binary=True.
- For word count features, use CountVectorizer with default settings.
- To generate TF-IDF features, use TfidfVectorizer, which automatically computes both term frequency (TF) and inverse document frequency (IDF) scores for each word.
- By adjusting the parameters of these vectorizers, such as binary, max_df, min_df, and ngram_range, you can customize the feature extraction process according to your specific requirements.
In our example, we will only use the word presence features (Binary Vectorizer).
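For reference, here is a minimal sketch (not used in the rest of the notebook) of how these parameters can be combined to customize the features:
from sklearn.feature_extraction.text import CountVectorizer

# Hypothetical configuration: binary unigram + bigram features, dropping very rare and very common terms
custom_vectorizer = CountVectorizer(
    binary=True,        # word presence instead of counts
    ngram_range=(1, 2), # unigrams and bigrams
    min_df=5,           # ignore terms that appear in fewer than 5 reviews
    max_df=0.9          # ignore terms that appear in more than 90% of reviews
)
# X_custom = custom_vectorizer.fit_transform(train_data)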
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
[nltk_data] Downloading package punkt to [nltk_data] /Users/ishitagopal/nltk_data... [nltk_data] Package punkt is already up-to-date!
True
Binary Vectorizer¶
# Create a CountVectorizer instance for feature extraction
# binary = True specifies that instead of counts we are using presence of the word as a feature
vectorizer_binary = CountVectorizer(tokenizer=word_tokenize, binary=True)
# Convert the text data into a document-term matrix
# These are our features
X_train_binary = vectorizer_binary.fit_transform(train_data)
X_test_binary = vectorizer_binary.transform(test_data)
/opt/anaconda3/lib/python3.11/site-packages/sklearn/feature_extraction/text.py:528: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None' warnings.warn(
# The above returns sparse matrices but we can look at the numbers
X_train_binary[0:10, 10:16].todense() # 1 represents presence of the word not counts here
matrix([[0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0], [0, 1, 1, 0, 0, 1]])
# We can store the vocabulary
vocab_dict = vectorizer_binary.vocabulary_ # returns a dictionary
vocab_list = vectorizer_binary.get_feature_names_out() # returns an array of words in alphabetical order
print(vocab_list[:50])
['\x05' '\x13earth' '\x13goodies' '\x13if' '\x13ripley' '\x13suspend' '\x13they' '\x13white\x14' '\x14' '\x16' '!' '#' '$' '%' '&' "'" "''" "'00s" "'10" "'28" "'40s" "'50s" "'58" "'60s" "'60s/'70s" "'69" "'70" "'70s" "'77" "'79" "'80" "'80s" "'82" "'90" "'90s" "'92/early" "'93" "'94" "'95s" "'96" "'97" "'98" "'act" "'acting" "'almost" "'amateur" "'amusing" "'ani" "'animal" "'answer"]
print(vocab_list[-50:])
['zingers' 'zinnemman' 'zinnia' 'zip' 'zipped' 'zippel' 'zipper' 'zippers' 'zippy' 'zit' 'ziyi' 'zodiac' 'zoe' 'zombie' 'zombie-like' 'zombie-stomping' 'zombies' 'zombified' 'zone' 'zones' 'zoo' 'zoo-like' 'zookeeper' 'zookeepers' 'zoolander' 'zoologist' 'zoom' 'zoom-in' 'zoom-ins' 'zoom-out' 'zoom-outs' 'zooming' 'zooms' 'zoot' 'zophres' 'zorg' 'zorro' 'zucker' 'zucker/abrahams/zucker' 'zuehlke' 'zuko' 'zukovsky' 'zulu' 'zurg' 'zweibel' 'zwick' 'zwigoff' 'zycie' '|' '||']
Count Vectorizer¶
# Use the count vectorizer instead
# Convert the text data into a document-term matrix
# These are our features
vectorizer_count = CountVectorizer(tokenizer=word_tokenize)
X_train_count = vectorizer_count.fit_transform(train_data)
X_test_count = vectorizer_count.transform(test_data)
# Instead of presence we see frequency of words here:
X_train_count[0:10, 10:16].todense()
matrix([[ 0, 0, 0, 0, 0, 2], [ 0, 0, 0, 0, 0, 0], [ 0, 1, 0, 0, 0, 0], [ 0, 0, 0, 0, 0, 0], [13, 0, 0, 0, 0, 1], [ 1, 0, 0, 0, 0, 0], [ 0, 0, 0, 0, 0, 0], [ 6, 0, 0, 0, 0, 1], [ 1, 0, 0, 0, 0, 0], [ 0, 1, 1, 0, 0, 2]])
Tf-idf Vectorizer¶
from sklearn.feature_extraction.text import TfidfVectorizer
# Create TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()
# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data)
# Transform the test data
X_test_tfidf = tfidf_vectorizer.transform(test_data)
X_test_tfidf.shape
(400, 36246)
Naive Bayes¶
Naive Bayes is a probabilistic classifier based on Bayes' theorem, which calculates the probability of a given class (e.g., positive or negative sentiment) given the input features (e.g., words in a document).
Independence Assumption: Naive Bayes operates under the assumption that the features (words) are conditionally independent given the class label. This is known as the "naive" assumption. In our example, Naive Bayes assumes that the presence of each word in a movie review is independent of the presence of other words, given the sentiment label (positive or negative). For example, knowing that a review contains the word "great" does not influence the likelihood of it containing the word "horrible", and vice versa. Although this assumption is very simplistic and may not hold true in practice, Naive Bayes often performs well in text classification tasks.
Conditional Probability: Naive Bayes calculates the conditional probability of each word given the sentiment label. For instance, it estimates the probability of seeing the word "great" in a positive review and the probability of seeing the word "horrible" in a negative review independently.The classifier combines the probabilities of individual words to calculate the overall probability of each sentiment label given the input review. It does so by multiplying the conditional probabilities of each word given the sentiment label.
Interpretability/ Feature Importance: Naive Bayes implicitly ranks features (words) based on their contribution to the classification decision. Words with higher conditional probabilities for a particular class are considered more important for predicting that class. This allows users to identify which words are most influential in determining the predicted class.
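To make this concrete, here is a small sketch with made-up numbers of how a Bernoulli Naive Bayes combines per-word probabilities for a toy two-word vocabulary, assuming equal class priors:
import numpy as np

# Toy vocabulary ["great", "horrible"]; a review containing "great" but not "horrible".
# The probabilities below are invented purely for illustration.
p_word_given_pos = np.array([0.30, 0.02])  # p(w | pos)
p_word_given_neg = np.array([0.05, 0.25])  # p(w | neg)
x = np.array([1, 0])                       # word presence in the review

# Bernoulli NB multiplies p(w|k) for present words and (1 - p(w|k)) for absent words
likelihood_pos = np.prod(p_word_given_pos**x * (1 - p_word_given_pos)**(1 - x))
likelihood_neg = np.prod(p_word_given_neg**x * (1 - p_word_given_neg)**(1 - x))

# With equal priors, normalizing the likelihoods gives the posterior for each class
p_pos = likelihood_pos / (likelihood_pos + likelihood_neg)
print(round(p_pos, 3))  # about 0.89: "great" without "horrible" points to the positive class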
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
NB Fit:¶
# Create an instance of the Naive Bayes classifier
classifier_nb = BernoulliNB()
# Train (fit) the classifier using the training data
classifier_nb.fit(X_train_binary, y_train)
BernoulliNB()
NB Predictions:¶
# Get (0/1) predictions for the test set
predictions_nb = classifier_nb.predict(X_test_binary)
# Get the predicted probabilities (between 0 & 1 ) for the test set
predicted_prob_nb = classifier_nb.predict_proba(X_test_binary)
# Each tuple contains a prediction and its corresponding predicted probabilities
# The first probability corresponds to the negative class (0) and the second to the positive (1)
list(zip(predictions_nb[:5], predicted_prob_nb[:5]))
[(1, array([1.22512981e-09, 9.99999999e-01])), (1, array([0.01576483, 0.98423517])), (1, array([4.96043969e-15, 1.00000000e+00])), (0, array([0.99524859, 0.00475141])), (1, array([0.14210348, 0.85789652]))]
# the sum of these 2 probabilities will equal 1 like we see below
np.sum(predicted_prob_nb[:5], axis=1)
array([1., 1., 1., 1., 1.])
NB Evaluate:¶
We will use accuracy as the main metric for measuring the performance of our classifiers. Accuracy is the ratio of correctly predicted instances to the total number of instances in the dataset, and it is a reasonable choice here because the classes are balanced.
# Evaluate the classifier
accuracy_nb = accuracy_score(y_test, predictions_nb)
print(f'Accuracy: {accuracy_nb:.2f}')
# Let's also print out the other metrics
print("\nClassification Report:")
print(classification_report(y_test, predictions_nb))
Accuracy: 0.78

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.90      0.80       199
           1       0.87      0.66      0.75       201

    accuracy                           0.78       400
   macro avg       0.80      0.78      0.78       400
weighted avg       0.80      0.78      0.78       400
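The confusion_matrix function was imported above but not yet used; printing it shows where the errors concentrate (rows are the true classes, columns the predicted classes):
# Rows: true class (0 = neg, 1 = pos); columns: predicted class
print(confusion_matrix(y_test, predictions_nb))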
NB Feature importance:¶
# Identify the most important features
feature_probabilities_nb = np.exp(classifier_nb.feature_log_prob_)
Note: feature_probabilities_nb contains the estimated probabilities that a word appears in a document of a given class, $p(w|k)$. But we want the probability that a document belongs to a given class, given that it contains that word, $p(k|w)$.
This can be obtained by Bayes' theorem (using equal class priors, which is appropriate here because the classes are balanced):
$p(\text{pos} | w) = \frac{p(w | \text{pos})}{p(w | \text{pos}) + p(w | \text{neg})}$
In this equation:
- $p(\text{pos} | w)$ is the posterior probability that a document is positive given that it contains the word $w$. It tells us how likely a document is to be classified as positive when $w$ is present, giving us a quantitative measure of how important a token is for classifying documents into different classes.
- $p(w | \text{pos})$ is the conditional probability that the word $w$ appears in a positive document.
- $p(w | \text{neg})$ is the conditional probability that the word $w$ appears in a negative document.
This equation gives the probability that a document belongs to a certain class (e.g., positive) given that it contains a specific word $w$.
# Calculate p(w|pos) + p(w|neg) for each word (the denominator in the equation above)
total_word_likelihood = np.sum(feature_probabilities_nb, axis=0)
# Calculate the probability that a document is negative given it contains the word (p(neg|w))
prob_negative_given_word = feature_probabilities_nb[0,:]/total_word_likelihood
# Calculate the probability that a document is positive given it contains the word (p(pos|w))
prob_positive_given_word = feature_probabilities_nb[1,:]/total_word_likelihood
vocab_list[np.argsort(prob_negative_given_word)[-30:][::-1]]
array(['insulting', 'hudson', 'sucks', 'unimaginative', 'leaden', 'degenerates', 'wielding', 'horrid', 'suvari', 'mena', 'pathetically', 'uninvolving', 'seagal', 'by-the-numbers', 'vein', 'tedium', 'ivy', 'jumbled', 'stupidity', 'turkey', 'aimlessly', 'nonsense', 'forsythe', 'macdonald', 'plodding', 'kidding', 'setups', 'chevy', 'hodgepodge', 'priests'], dtype=object)
vocab_list[np.argsort(prob_positive_given_word)[-30:][::-1]]
array(['debate', 'outstanding', 'missteps', 'addresses', 'intimate', 'refreshingly', 'detract', 'finest', 'passage', 'deft', 'narrates', 'embodies', 'lovingly', 'gattaca', 'symbol', 'meryl', 'soviet', 'discussed', 'studies', 'marvelous', 'astounding', 'masterfully', 'freed', 'fabric', 'tobey', 'lithgow', 'criticized', 'uncut', 'online', 'magnificent'], dtype=object)
Plot the feature importance:¶
# Plot weights
import matplotlib.pyplot as plt  # needed for the plots in this cell

col_sums = np.sum(X_train_binary, axis=0).tolist()[0]
post_pos = prob_positive_given_word.tolist()
plt.figure(figsize=(16, 12))
plt.scatter(col_sums, post_pos, marker='o', color='black', alpha=0.5, s=20)
plt.title("Posterior Probabilities, Naive Bayes Classifier, IMDB")
plt.xlabel("Total Appearances")
plt.ylabel("<--- Negative Reviews --- Positive Reviews --->")
plt.xlim(0, 1000)
plt.ylim(0, 1)
# Add text labels
for i, txt in enumerate(vocab_list):
plt.text(col_sums[i],
post_pos[i],
txt,
ha='left',
va='center',
alpha=.30,
fontsize=50*abs(.5-post_pos[i]),
color="black")
plt.show()
Zoom into the positive words:¶
# Plot weights
plt.figure(figsize=(16, 10))
plt.scatter(col_sums, post_pos, marker='o', color='black', alpha=0.5, s=20)
plt.title("Posterior Probabilities, Naive Bayes Classifier, IMDB")
plt.xlabel("Total Appearances")
plt.ylabel("<--- Negative Reviews --- Positive Reviews --->")
plt.xlim(0, 150)
plt.ylim(0.75, 1)
# Add text labels
for i, txt in enumerate(vocab_list):
if post_pos[i]>= 0.75:
plt.text(col_sums[i],
post_pos[i],
txt,
ha='left',
va='center',
alpha=.60,
fontsize=20*abs(post_pos[i]),
color="black")
plt.show()
Zoom into the negative words:¶
# Plot weights
plt.figure(figsize=(16, 10))
plt.scatter(col_sums, post_pos, marker='o', color='black', alpha=0.5, s=20)
plt.title("Posterior Probabilities, Naive Bayes Classifier, IMDB")
plt.xlabel("Total Appearances")
plt.ylabel("<--- Negative Reviews --- Positive Reviews --->")
plt.xlim(0, 250)
plt.ylim(0, 0.25)
# Add text labels
for i, txt in enumerate(vocab_list):
if post_pos[i]<= 0.25:
plt.text(col_sums[i],
post_pos[i],
txt,
ha='left',
va='center',
alpha=.50,
fontsize=80*abs(post_pos[i]),
color="black")
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
# Predicted probabilities for the negative class (column 0) and positive class (column 1)
predicted_prob_nb_neg = predicted_prob_nb[:, 0]
predicted_prob_nb_pos = predicted_prob_nb[:, 1]
# Plot density
sns.kdeplot(predicted_prob_nb_pos, fill=True)
plt.title("Predicted Probabilities from Naive Bayes Classifier")
plt.xlabel("")
plt.ylabel("Density")
# Add rug plot
#plt.yticks([]) # Remove y-axis labels
sns.rugplot(data=predicted_prob_nb_pos, color="black", height=.1)
plt.show()
/opt/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. with pd.option_context('mode.use_inf_as_na', True): /opt/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. with pd.option_context('mode.use_inf_as_na', True):
The "naive" part of naive Bayes: The model is far more confident about its predictions than is warranted.
What's the most positive review in the test set according to this?¶
len(y_test)
400
# Find the index of the most positive/least negative probability:
## sort the predicted_prob_nb_neg array in ascending order using np.argsort
## and take the index at the first position (the smallest p(neg), i.e., the most positive review)
least_negative_index = np.argsort(predicted_prob_nb_neg)[0]
# Index of the review with the smallest predicted negative probability
least_negative_index
290
predicted_prob_nb[least_negative_index]
array([1.27249754e-42, 1.00000000e+00])
y_test.iloc[least_negative_index]
1
test_data.iloc[least_negative_index]
'i know that " funnest " isn\'t a word . \n " fun " is a noun , and therefore cannot be conjugated like an adjective . \nbut that\'s the word that came to me right after viewing " chicken run . " \nno wonder : this is the kind of movie that reduces you to childish expressions , like " that was the funnest movie i\'ve ever seen ! " \nso to hell with webster\'s - " chicken run " is one of the funnest movies i\'ve seen in a while . \ni can\'t remember the last time i\'ve seen anything funner . \nthe chickens at tweedy\'s farm are up to something . \nliving in a concentration camp-like atmosphere , they are led by one plucky hen called ginger ( voice of julia sawalha ) who continually comes up with plans for escape - and always gets caught , subsequently spending day after day in the coal box . \none night , a brash american rooster flies in over the fence , calling himself rocky ( mel gibson ) , famous flying rooster and circus performer . \nrocky promises to teach the chickens how to fly , and the situation grows more desperate as the nefarious mrs . tweedy ( miranda richardson ) decides to abandon the farm\'s egg-selling plan for a pie-selling plan . \nshe orders a huge pie-making machine , cackling , " chickens go in ; pies come out . " \nwhat sort of pies ? \nchicken pies , of course . \nco-director nick park and his studio , aardman animation , produced three oscar-winning short films : " creature comforts , " " the wrong trousers , " and " a close shave . " \nthe latter two were the second and third installments of a trilogy starring the delightful team wallace & gromit , a man and his dog famous for getting themselves into increasingly peculiar adventures . \npart of what made the " wallace " films brilliantly entertaining was park\'s uncanny ability to make an old story seem new . \n " the wrong trousers , " for example , pulled out a lot of old hitchcockian suspense tricks , and " a close shave " owed a great deal of inspiration to classic detective stories . \nbut in the hands of park and his team , the stories felt fresh and inspired , and not the least bit contrived . \nnow teaming with co-director peter lord , park has created a similar creature in " chicken run . " \nthe plot is largely lifted from " the great escape " ( watch for a quick reference to the ball-bouncing scene ) , with some spielberg-inspired action sequences providing the excitement . \nthing is , the film doesn\'t feel like it\'s been lifted from somewhere else ; " chicken run " feels fresh , alive , like nothing else ever done . \npart of it is the original idea : who in the world came up with the idea to make a prisoners-of-war movie starring chickens ? \npark and lord milk the incongruity for all it\'s worth : the characters treat their situation as if it were dead serious , and to them , it is . \nbut they\'re * chickens * , so it\'s funny to us . \nanother thing that helps " chicken run " ( and most of park\'s films ) succeed is the animators\' subtle way of giving nods to the very films they\'re cribbing . \nthere are references to " the great escape , " obviously , and " stalag 17 . " \nthe chase sequence inside the chicken grinder parodies both " raiders of the lost ark " and " indiana jones and the temple of doom . " \nthere\'s even a nod to " the blues brothers , " if you can believe it , and the filmmakers even get in a little light-hearted ribbing at the expense of their star voice actor , taking a couple of very subtle jabs at mel gibson\'s nationality and film history . 
\nthe " braveheart " reference is a hoot if you catch it . \nwhen you consider just how fine a line resides between a funny parody and a redundant one , " chicken run " is downright brilliant in its execution . \nconsider this : since july of last year , i\'ve seen dozens upon dozens of " blair witch project " parodies , on television , on the radio , and online . \nout of all those , maybe one or two were amusing , the rest tiresome . \nwhy ? \nbecause once you got past the thought that " they\'re parodying that \'blair witch\' movie , " most of the parodies had nothing to offer - no insight , no original idea , no greater purpose . \nthey were only funny if you had seen the movie ; otherwise , you got nothing . \nconversely , " chicken run " doesn\'t lean on its parodies - the film references are never the sole focus of any scene . \nif you\'ve never seen " indiana jones , " the chase sequence is still fun . \neven if you have no idea who mel gibson is , rocky remains an interesting character . \npark , lord , and screenwriter karey kirkpatrick realize that an audience does not need a parody shoved in their faces before it\'s understood , and as a result , " chicken run " is filled with visual and verbal jabs that are never too obvious , but not quite vague either . \nenjoyment of this movie doesn\'t require knowledge of film history , but if you\'ve got it , " chicken run " is that much richer . \nthe voice cast turns in uniformly lively performances , and the characters they play are given so many subtle nuances that it\'s hard not to fall in love with every one of them . \nginger is perfect , spunky and opinionated , but with a soft heart that forbids her to leave her companions behind , and julia sawalha ( known as the cute mousy girl on " absolutely fabulous " ) nails it . \nshe has warm , human chemistry with gibson , whose rocky hides his personal doubt under a brash gung-ho veneer . \nthe characters\' inevitable romance doesn\'t feel contrived , but sweet . \npoor mr . tweedy ( tony haygarth ) suspects the chickens are organizing in some way , but his limited intellect prevents him from figuring things out , and his overbearing wife certainly isn\'t any help . \njane horrocks delivers a lovely voice characterization as tragically optimistic babs ( a hen that runs away with most of the best one-liners , all while perpetually crocheting a sweater ) , and two supply-trading rats that seem to have walked straight out of a monty python sketch nearly steal the show during the lively central swing-dancing sequence . \nthey even come up with an inspired riff on something that\'s baffled scholars and theologians alike for decades : the chicken vs . egg dilemma . \none might have expected this inevitable joke to come off as wearisome , but as with most of " chicken run , " it comes as a delightful surprise . \nthe film manages to cross all barriers ; it should be accessible to both children and adults , brits and yanks , rats and chickens . \nthe animation is first-rate . \nremember that this was all done the old-fashioned way , with actual humans moving clay figurines around on a tiny set bit by painstaking bit , and you see just how truly remarkable " chicken run " really is . \nthis movie possesses that same quality that makes pixar studios\' animation great : precise and unwavering attention to detail . \nas in such top-notch family fare as " a bug\'s life " or " toy story 2 , " " chicken run " has something to offer in nearly every frame . 
\ncertainly a passive viewer can enjoy it as a straightforward story with a worthwhile moral and some surprisingly touching scenes , but a active , attentive viewers will enjoy it even more because they\'ll catch all the details . \nanyway , the movie\'s just a boatload of fun . \nfunnest damn movie i\'ve seen all year . '
What is the most negative review in the test set according to this?¶
# Find the index of the most negative/least positive probability:
least_positive_index = np.argsort(predicted_prob_nb_pos)[0]
least_positive_index
48
predicted_prob_nb[least_positive_index]
array([1.00000000e+00, 4.85161314e-39])
y_test.iloc[least_positive_index]
0
test_data.iloc[least_positive_index]
'it rocks-actually , lots of rocks fly at us or from us , in slow or fast motion , at several points in the film . \nthey seem like dangerous rocks because they kind of twirl through the air instead of just propelling forward , and when they land-once in a while , when we need a break from the space sequences-they cause damage enough to destroy the chrysler building and the like . \n ( nary a mention of these apocalyptic events is made after they occur . ) \nthey also just might be the most interesting element of armageddon , a steroid user\'s answer to deep impact . \nbruce willis stars as harry stamper , a famed oil-driller commissioned by the white house and nasa to stop a giant asteroid before it travels beyond " zero barrier " and destroys our planet . \nwhy an oil driller ? \nthey require someone experience with deep-core mining to plant a nuclear missile into said asteroid . \n ( in one unintentionally ( ? ) \nhilarious sequence , nasa asks harry to inspect a deep-core driller they built based on his blueprints ; it is poorly constructed-harry criticizes almost every aspect of it . \nwe trust nasa to build space shuttles ? ) \nharry assembles the obligatory " ragtag " bunch of " cowboys " , including a blond guy , a fat guy , a black guy , a wiseass , and the man who is sleeping with his daughter ( affleck ) . \nonce they reach space , we experience sequence after sequence of something going wrong-perhaps the fact that they sent a bunch of nincompoops into outer space has something to do with it ; i cannot count the number of times they almost fail the mission on all my fingers and toes . \nwhether or not they save the day , i will not reveal . \nnor will you care . \ni will say this : you know you\'re in trouble when deep impact dwarfs your asteroid movie in terms of emotion and scope . \nwillis has barely a chance to come alive ; ditto for affleck . \ntheir big scenes are mostly reserved for the third act , in a last minute-and futile-attempt to inject warmth into the material . \nsteve buscemi\'s character-the wiseass-is exceptionally problematic . \n " rockhound " , as he\'s called , is sarcastic and foolish , so they tape him to a chair , where he spends most of the film . \nso why did they bring him up there to begin with ? \nrather , why write him into the film ? -give \nhis almost-witty one-liners to serious willis , who scowls and mopes and demonstrates psychotic tendencies : at one point he chases after affleck with his shotgun for screwing his daughter , firing often and causing significant damage to his oil rig . \ni\'m guessing he qualifies under nasa guidelines as someone unfit for space travel , at least in my world where the sky is blue . \nliv tyler is pretty and humourless , as always ; suspiciously , four of her father\'s band\'s ( " aerosmith " ) songs grace the soundtrack . \ndirector michael bay lays the visual and sound effects on thick , like ketchup , eventually drowning the movie on-screen . \n ( the middle hour is a non-sensical , pyrotechnic assault on the average primate\'s brain . ) \nwhenever someone dies in this movie , a crew member inevitably yells out " we lost ( insert dead person\'s last name here ) ! " \ni must admit that not once could i distinguish a dead oil-guy-cum-astronaut from a live one , and close-ups of the corpses\' faces beneath cracked helmets provided little assistance , as their skin was often covered in fake blood . \narmageddon is not as terrible movie as godzilla . 
\nit looks nicer , and has fewer plot-holes within its equally ludicrous framework . \nit has a vivid soundmix . \nbut at almost two-and-a-half hours , i could not believe how little actually happened over the course of the story . \nthe love story has been played up in the ads , perhaps hoping to catch people before they recover from titanic-fever . \nbollocks ! \nthe lovers in the film are miles apart throughout-erase all thoughts of nude sketching or car-sex and replace them with obligatory shots of liv tyler tearing up while ben affleck dicks around in a moon-crawler . \nremember a little film called jaws ? \nin this film , three independent-minded men suddenly found themselves on a fishing boat in pursuit of a deadly shark . \nthey didn\'t much like each other at first ; eventually , they started to respect one another . \none of jaws\' great scenes involved the would-be-ahabs drinking and singing songs and telling stories . \nthis is the sort of male-bonding foreign to bay or his producer , jerry bruckheimer , who throw too many characters into the mix and expect we\'ll care about them on the grounds that the world is about to end . \nnot once do we get the feeling that these characters are even acquaintances-i\'d be surprised if these actors bothered to introduce themselves to one another before " action " was called . \na male friend who loved the film suggested to me that perhaps i cannot relate to a bunch of men who don\'t bare their souls , who believe in dying macho concepts like heroism and a kind of chest-beating bravery . \nto this , i will respond that the boys in armageddon are neither heroic , nor brave , nor smart , even : this team couldn\'t build a birdhouse . \nand if i get no respect for disliking a movie with all the synthetic feeling of a trailer-a trailer for a movie written by a team of body-builders and greeting card authors-i\'ve never been a prouder wimp my whole life . '
More words mean more votes, so longer documents look more decisively positive or negative to the model. It would also underplay a review that read, in its entirety, "terrible", even though that review is 100% clear in its sentiment.
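One rough way to check this intuition (a minimal sketch): the gap between the two class log-probabilities grows with the number of words that get to "vote", so it should correlate with review length.
# Correlate review length (in whitespace tokens) with the absolute log-odds of the prediction
log_proba = classifier_nb.predict_log_proba(X_test_binary)
abs_log_odds = np.abs(log_proba[:, 1] - log_proba[:, 0])
review_lengths = test_data.str.split().str.len()
print(np.corrcoef(review_lengths, abs_log_odds)[0, 1])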
What is it most confused about?¶
# The index of the probability that is closest to 0.5 for the "positive" class.
most_confused_index = np.argsort(np.abs(predicted_prob_nb_neg - 0.5))[0]
predicted_prob_nb[most_confused_index]
array([0.52331012, 0.47668988])
y_test.iloc[most_confused_index]
1
test_data.iloc[most_confused_index] ##
"review- peter jackson's the frighteners has received some notice for setting the record for most computer effects ever in a movie , and still coming in at the extremely cheap $30 million price tag . \nbut for those who were dismayed by this year's blockbusters like twister and independence day , the frighteners has much more to offer than special effects . \nand for those worried wether or not peter jackson would compromise to hollywood you can rest easily . \nthe frighteners is as far removed from hollywood as a high-profile movie can get . \nmichael j . fox stars as frank bannister , a con artist who can speak to ghosts . \nhe uses this ability to set up a scam in a small town where his ghost buddies scare the hell out of people , then he comes and pretends to get rid of them . \nthis is how he has made a living ever since his wife died in a car crash 5 years ago . \nfrank's latest customers are a young couple , played by trini alvarado and peter dobson . \nwhen dobson ends up dead , alvarado starts to take an interest in fox . \nbut dobson's spirit is still around as he refuses to beleive he's dead . \nthis leads to a very awkward and amusing dinner date between fox and alvarado , with dobson tagging along as a ghost . \nthings start getting complicated for fox when he is accused for a series of murders taking place in the town . \nfox sees someone named the soul collector crushing the heart of the victims , but noone else can see that . \nso when fox shows up to try and save each victim , naturally people suspect he is the killer . \nfox sees that alvarado is next on the soul collector's hit list , and the last half hour of the movie deals with fox's attempts to save her from this evil spirit . \nthere are many wonderful twists and turns in the screenplay written by peter jackson and frances walsh . \nthe movie starts off as a black comedy , and ends up a horror-action film . \nthe mix between these genres are perfect . \nno laughs are sacrificed in the name of horror , and vice versa . \none point of contention might be a lackluster score by danny elfman . \nbut that hardly seems like a flaw when you have such a diverse cast all in top form . \nmichael j . fox delivers one of his best performances to date as a man who hides the sorrow of his wife's death , and then is forced to confront this later on . \nalvarado , looking like andie macdowell , makes a great frightened , tough , and smart heroine . \nand jeffery combs , as a paranoid fbi agent , is brilliantly bizaare . \nthe frighteners never once feels like it is running long . \nthe first hour is as funny as any comedy this year , and the last half hour is as thrilling as any of the big budget blockbusters . \nthis movie is probably what casper would've looked like if david lynch directed it . \nit's easily the best film of the year , so far . "
# Among the first 50 test reviews, find the one the model is most confident is positive
clearly_wrong = np.argsort(predicted_prob_nb_neg[:50])[0]
predicted_prob_nb[clearly_wrong]
array([3.07634031e-38, 1.00000000e+00])
y_test.iloc[clearly_wrong]
0
test_data.iloc[clearly_wrong] ## Yup, that's a clear mistake. Overfitting?
'the following review encompasses two versions of dune : \ndune : the theatrical version ( 1984 ) runtime : 137 minutes capsule review : cut down to just over two hours by nervous studio executives , the theatrical version of dune is a spectacular mess and may be incomprehensible to those unfamiliar with the book . \nthe film\'s visual splendour , mystical beauty and impressive action scenes only partly compensate for gaping holes in the narrative . \ndune : the extended version ( 1988 ) runtime : 189 minutes capsule review : a bit of a throw-together assembled by mca tv special projects for cable television . \nit was disowned by director david lynch but it\'s considerably closer to his original vision by virtue of its improved characterisation and clearer storyline . \nquality dubs of this version from the out-of-print japanese laserdisc release are available from various dealers on the world wide web . \n * * * the review * * * \nreleased in 1984 and made on a then mammoth budget of $40 million , the film of frank herbert\'s cult novel dune was eagerly awaited by sci-fi fans . \ndirector david lynch ( blue velvet , eraserhead , twin peaks ) was working on his biggest production to date , a mammoth undertaking filmed under trying conditions on location in mexico . \nthe screenplay was lynch\'s own , chosen after the script submitted by original author herbert was rejected . \ndune is set in a universe ruled by powerful families overseen by a successive line of emperors . \nthe key to cosmic power is the planet arrakis ( dune ) , a windswept desert planet that\'s home to giant sandworms and the precious spice melange . \nthe spice is the most valuable commodity in the universe . \nit extends the life and expands the consciousness of those who consume it . \nmost importantly , it allows the navigators of the spacing guild ( once human but now hideously mutated ) to " fold space " and navigate their spacecraft across mammoth distances instantaneously , enabling interstellar commerce and trade to flourish . \nlynch\'s film by necessity excises parts of the book while retaining the story\'s two main strands . \none is the long-standing rivalry between two families , houses atreides and house harkonnen , and their battle for lucrative mining rights on arrakis . \nthe second strand is the emergence of young paul atreides as the reluctant messiah long-awaited by the natives of arrakis , the fremen . \nthe deeply religious fremen want control over their homeworld , and young paul may be the fulfilment of their prophecy that a man would come from the outer worlds and lead them to freedom . \nunfortunately , this epic story unfolds in a confusing and haphazard manner in the theatrical cut of the film , which runs 30 to 60 minutes shorter than what lynch originally intended . \nthe thinking among universal\'s oh-so-wise money men was that films over two hours in duration were not popular with audiences at the time and would not do well at the box office . \nwith lynch\'s initial cut running at closer to three or more hours , the studio demanded that further cuts be made . \nwhat a great idea ! \nwhy not trim down an already complex film so as to make it almost incomprehensible ? \nthe most glaring consequence of this one-eyed stupidity is a hopelessly jumpy narrative , leaving us with badly underdeveloped characters . \nthus their personalities are vague , their motivations unclear and , in the case of paul\'s father duke leto , their demise rather meaningless . 
\nthe end result is a distinct chill : we can\'t warm to most of the cast and we don\'t care much for them . \nand it hardly helps that the voice-over narration is sparse and that the dune-esque language and terminology sounds like so much gobbledegook to those unfamiliar with the book . \ndune is also a very serous film . \nthe constant " self-talk " by various characters makes it so serious and self-absorbed at times that you may find it hard not to wince with embarrassment . \nthe overall impression is a world full of people so intense that no one is allowed a joke lest the universe come crashing down around them . \nhumour - or at least a gentle kind of humour as distinct from the harkonnen\'s mad , sadistic kind - is hard to find . \nyou may balk at the comparison , but as a writer lynch could well have done with some lessons from george lucus\' star wars trilogy . \nthe theatrical version is still some way from being a complete disaster , however . \nit still possesses enough of lynch\'s stylistic quirks and enough visual invention to sustain the interest of viewers with a taste for imaginative sci-fi . \nspecial effects whiz carlo rambaldi\'s giant sandworms are an awesome sight . \nboth the production design ( anthony masters ) and costume design ( bob ringwood ) are striking and original . \nand the magnificent score by toto and brian eno is one of the most underrated soundtracks of the last twenty years . \nwith these elements in place and the benefit of freddie francis\' lush cinematography , the film is at least a feast for the senses . \nsee it in the widescreen format if you can . \nand despite all the cuts , several cast members still make a strong impression , most notably kenneth mcmillan as the supremely nasty baron vladimir harkonnen . \nsian phillips also registers strongly as the reverend mother gaius helen mohiam , leader of the bene gesserit religious order who\'s secret aim is to manipulate paul\'s destiny for its own shadowy ends . \nas paul atreides , the young kyle maclachlan starts off somewhat shakily , but as his character grows in strength so does his performance and he emerges as a credible leader of the fremen crusade . \nthe conclusion ? \nany assessment of this film must take into account that frank herbert\'s original novel is a complex piece of work and presents a tough challenge for any filmmaker . \ndavid lynch took a brave stab at it and , partly due to forces beyond his control , ended up with an officially released version that fails in several key respects . \ndune certainly confused and frustrated a lot of people on its release . \nmany chose to stay away altogether , as the film\'s disastrous box office showing attests . \nthe extended version , however , is a rather different beast . \nin 1984 lynch stated his intention to release his own special edition " director\'s cut " of the film on home video , a clear indication of his dissatisfaction with the version that ended up in the theatres . \nbut , alas , he failed to do so , choosing to move on to other projects . \nin a way , then , it is partly lynch\'s own fault that what appeared instead was an unauthorised extended version , put together in 1988 by mca tv special projects for airing on cable networks in the usa . \nstung into action , lynch successfully petitioned the director\'s guild to take his name off the credits and replace it with " allen smithee " , the standard pseudonym for directors who wish to disown their own work . 
\nhe also had the screenwriting credit changed to the anonymous " judas booth " . \ncertainly , looking at the results of mca\'s handiwork there\'s at least half a dozen instances that , for sheer technical sloppiness , are good enough reasons for the director to object . \nbut these gripes must be considered in light of the improvements that the extended cut of dune offers in several crucial areas . \nmost of the changes involve the restoration or extension of cut scenes and the addition of extra narration , both of which fill many holes in the original version\'s storyline . \npaul\'s relationship with his father and associates is more intimate , with moments of humour and warmth lacking previously . \nthe political skulduggery involving the emperor , the spacing guild , the bene gesserits and the two warring houses is far better explained . \npaul\'s initiation into the fremen way of life on arrakis is also fleshed out considerably . \nand as further background , a new prologue has been added featuring narration and painted stills to give us a brief history of the dune universe . \nas a piece of storytelling , then , mca tv\'s version of dune is clearly superior . \nas a piece of editing , however , it is at times surprisingly inept . \nthe use of painted stills in the new prologue works well enough , but their occasional appearance once the action begins is inappropriate . \nthere\'s some sloppy cutting , too , and in a few instances shots even appear out of order . \nand the use of repeated footage to fabricate certain scenes ( eg . \nships coming and going , soldiers coming and going ) is at times clearly out-of-context . \nthis is the kind of thing to which lynch objected , and rightly so . \nit should also be noted that several questionable scenes and shots from the theatrical version were deleted to satisfy the censorship demands of u . s . television . \nbut the most notable omission is a gratuitous piece of nonsense from lynch that wasn\'t even in herbert\'s book . \nthe scene features baron harkonnen killing a beautiful young man in front of his slobbering henchmen by pulling out his " heart plug " . \nits a surreal and disturbing episode that\'s very lynch-esque but adds nothing to what we already know : the baron is a nasty piece of work . \ndespite its own peculiar flaws , then , the extended version of dune is a generally superior film . \nall up , it contains 35 minutes of restored footage and approximately another 15 minutes of either altered , fabricated or newly created sequences . \nunless the idiosyncratic lynch has a sudden change of heart , the " alan smithee " version remains the closest we\'ll get to what the movie should have been . \non repeated viewings , one suspects it is closer than what lynch would be prepared to admit . \nstill , as one of this century\'s great science-fiction novels , some fans and perhaps the late herbert himself would argue that dune deserved a better fate in its transfer to the screen . \nwith rumours circulating of a new six hour mini-series planned by production company new amsterdam entertainment in 1998 , it is unlikely that we have heard the last of the dune saga . '
Support Vector Machine¶
Unlike Naive Bayes, which is a probabilistic classifier, SVM is a discriminative classifier that finds the optimal hyperplane to separate different classes in the feature space. This problem can be formulated as a constrained optimization problem, where we seek to maximize the margin between the support vectors while minimizing the classification error.
Support vectors: In our context, these are the movie reviews that lie closest to the decision boundary, i.e., the hyperplane that delineates positive and negative reviews. These support vectors are crucial because they define the position and orientation of the decision boundary. They represent the most informative and influential reviews in terms of distinguishing between positive and negative sentiments.
The SVM algorithm constructs the decision boundary such that it maximizes the margin, or the distance between the hyperplane and the closest support vectors from each class. By doing so, the algorithm aims to achieve the best possible separation between positive and negative reviews.
Interpretability: SVMs make use of kernels, and a linear kernel results in a linear decision boundary, which is easier to interpret and understand. The decision boundary is a hyperplane in the feature space, and the coefficient associated with each feature indicates the importance of that feature in the classification decision. This interpretability is valuable for understanding which features matter most for sentiment analysis of movie reviews.
SVM Fit:¶
# Import necessary libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
# train_data, test_data, y_train, y_test
# Initialize the SVM model
classifier_svm = SVC(kernel='linear', C= 0.1, probability=True) # Use linear kernel. You can also try other kernels like 'rbf', 'poly', etc.
# Fit the model to the training data
classifier_svm.fit(X_train_binary, y_train)
SVC(C=0.1, kernel='linear', probability=True)
SVM Predictions:¶
# Predictions - predict labels and probabilities for the test data
predicted_proba_svm = classifier_svm.predict_proba(X_test_binary)
predictions_svm = classifier_svm.predict(X_test_binary)
SVM Evaluate:¶
We get an accuracy of 84%, noticeably higher than the 78% from Naive Bayes.
# Evaluate the model
accuracy_svm = accuracy_score(y_test, predictions_svm)
report_svm = classification_report(y_test, predictions_svm)
print("Accuracy:", accuracy_svm)
print("Summary", report_svm)
Accuracy: 0.84
Summary
              precision    recall  f1-score   support

           0       0.83      0.85      0.84       199
           1       0.85      0.83      0.84       201

    accuracy                           0.84       400
   macro avg       0.84      0.84      0.84       400
weighted avg       0.84      0.84      0.84       400
Let's look inside the model:¶
Let's look at some interpretable information about the SVM model:
# unique classes
print("Unique classes:", classifier_svm.classes_)
# weight vector or coefficient vector (tells us the weights assigned to each feature)
# the larger the weight (in absolute value), the more important the feature
print("Weight or coefficient vector:", classifier_svm.coef_.toarray())
# number of support vectors for each class
print("Num of support vectors for each class:", classifier_svm.n_support_)
# total number of support vectors
num_support = sum(classifier_svm.n_support_)
print(f"Total num of support vectors: {num_support}. This means our SVM model has identified {num_support} data points from the training set as crucial for defining the decision boundary.")
# index of the support vectors
print("Indices of the support vectors:", classifier_svm.support_)
# Lagrange multiplier times label (alpha_i * y_i) associated with each support vector
# This gets used when calculating the weight vector!
print("Lagrange multiplier associated with each support vector:", classifier_svm.dual_coef_.toarray())
Unique classes: [0 1]
Weight or coefficient vector: [[ 0.0075401 -0.00048252 -0.00550084 ... 0. 0.00062448 0. ]]
Num of support vectors for each class: [581 578]
Total num of support vectors: 1159. This means our SVM model has identified 1159 data points from the training set as crucial for defining the decision boundary.
Indices of the support vectors: [ 1 2 3 ... 1595 1598 1599]
Lagrange multiplier associated with each support vector: [[-0.01132617 -0.00020885 -0.00337526 ... 0.004575 0.00079988 0.00158499]]
Some math:¶
The decision function is: $f(x) = \mathbf{w} \cdot \mathbf{x} + b$
- If f(x) is positive, the SVM predicts the positive class.
- If f(x) is negative, the SVM predicts the negative class.
The weight vector $\mathbf{w}$ in the above equation corresponds directly to the coefficients returned by the .coef_ attribute.
Interpretation:
The components of $\mathbf{w}$ act as weights that determine the importance of each feature in the decision-making process. In our case, each token is a feature, and we are weighting the importance of each token.
Analogy with linear regression: In linear regression, the coefficients that multiply the input features are often referred to as weights; they determine the contribution of each feature to the predicted output. Similarly, in SVMs, $\mathbf{w}$ determines the orientation of the decision boundary in the feature space, and its components can be seen as weights assigned to each feature.
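As a quick sanity check (a minimal sketch), we can reproduce scikit-learn's decision_function for a few test reviews directly from .coef_ and .intercept_:
# f(x) = w . x + b, computed by hand for the first five test reviews
manual_scores = X_test_binary[:5].dot(classifier_svm.coef_.T).toarray().ravel() + classifier_svm.intercept_
sklearn_scores = classifier_svm.decision_function(X_test_binary[:5])
print(np.allclose(manual_scores, sklearn_scores))  # expected: True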
Also note that:
$\mathbf{w}$ can be expressed as: $\mathbf{w} = \sum_{i=1}^{n} \alpha_i y_i \mathbf{x_i} $
Here, $\alpha_i$ represents the Lagrange multiplier associated with the $i$-th support vector, $y_i$ is the corresponding class label, and $\mathbf{x_i}$ is the $i$-th support vector. The coefficients returned by .coef_ are directly related to the Lagrange multipliers ($\alpha_i$) used in the optimization problem, as they are used to compute the weight vector $\mathbf{w}$.
Let's use the quantities returned by scikit-learn to verify that $\mathbf{w} = \sum_{i=1}^{n} \alpha_i y_i \mathbf{x_i} $¶
# dimension of our document-term matrix:
X_train_binary.shape
# There are 41991 features
(1600, 41991)
w_direct = classifier_svm.coef_
w_direct.shape # we have 41991 weights corresponding to each feature/token
(1, 41991)
According to the documentation, classifier_svm.dual_coef_ corresponds to the $\alpha_i y_i$ terms, and X_train_binary[classifier_svm.support_] corresponds to the $\mathbf{x_i}$: it returns the subset of the training data consisting only of the support vectors identified by the SVM model. The dot product of these two quantities gives the weight associated with each feature, and these weights should equal the ones returned by .coef_.
Let's verify the dimensions of the matrices to make sure we can perform multiplication:
X_train_binary[classifier_svm.support_].shape # there are 1159 support vectors and 41991 features
(1159, 41991)
classifier_svm.dual_coef_.shape # There are 1159 alpha_i values, one for every support vector
(1, 1159)
w_calc = np.dot(classifier_svm.dual_coef_, X_train_binary[classifier_svm.support_]) # This dot product will be 1 x 41991
np.unique((w_direct == w_calc).toarray(), return_counts=True)
/opt/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py:3553: SparseEfficiencyWarning: Comparing sparse matrices using == is inefficient, try using != instead. exec(code_obj, self.user_global_ns, self.user_ns)
(array([ True]), array([41991]))
Yay! That worked. w_direct and w_calc are indeed the same thing. Let's use these weights to look at the important words for the positive and negative class:
SVM Feature importance:¶
# Let's sort the coefficients in ascending order
sorted_coefs = np.argsort(classifier_svm.coef_.toarray()[0])
# retrieves the top 30 words from the vocabulary list that are most indicative of the positive class
# (according to their associated coefficients in the SVM model.)
print(vocab_list[sorted_coefs][-1:-31:-1])
print(classifier_svm.coef_.toarray()[0][sorted_coefs][-1:-31:-1])
['others' 'as' 'very' 'pace' 'memorable' 'change' 'overall' 'town' 'excellent' 'performances' 'enjoy' 'true' 'well' 'perfectly' 'wonderfully' 'hilarious' 'sometimes' 'american' 'terrific' 'especially' 'simple' 'back' 'laughs' 'flaws' 'most' 'that' 'many' 'entertaining' 'great' 'summer'] [0.13532363 0.12340093 0.11500559 0.11357976 0.11082435 0.10790578 0.10685244 0.10551024 0.10344589 0.10037851 0.09831883 0.09642252 0.09472701 0.09466765 0.0943991 0.09433532 0.09327348 0.0923903 0.09225329 0.0896015 0.08888229 0.08853742 0.08787216 0.08728341 0.08435347 0.08425743 0.08228719 0.08154667 0.08082275 0.07899405]
# retrieves the top 30 words from the vocabulary list that are most indicative of the negative class
# (according to their associated coefficients in the SVM model.)
print(vocab_list[sorted_coefs][:30])
print(classifier_svm.coef_.toarray()[0][sorted_coefs][:30])
['bad' 'worst' 'waste' 'nothing' 'plot' 'have' 'falls' 'script' 'poor' 'unfortunately' 'lame' 'awful' 'wasted' 'supposed' 'terrible' 'should' 'ridiculous' 'only' 'mess' 'stupid' 'boring' 'reason' 'potential' 'pointless' 'neither' 'flat' 'cheap' '2' 'career' 'maybe'] [-0.15303128 -0.15242166 -0.14769221 -0.14370069 -0.13460148 -0.12386702 -0.122686 -0.11391206 -0.11325014 -0.11103984 -0.11002238 -0.10935591 -0.1069662 -0.10439135 -0.10412101 -0.10249379 -0.09819262 -0.09687126 -0.0967286 -0.0943604 -0.09335848 -0.0916916 -0.0863012 -0.08543421 -0.08509572 -0.08165018 -0.08156772 -0.07970779 -0.07966579 -0.07944191]
## Let's plot the words based on these coefficients:
import numpy as np
import matplotlib.pyplot as plt
# Compute column sums of X_train_binary (total appearances of each word) and get the SVM weight vector
col_sums = np.sum(X_train_binary, axis=0).tolist()[0]
w = classifier_svm.coef_.toarray().tolist()[0]
plt.figure(figsize=(12, 10))
# Plot the coefficients against column sums
plt.scatter(col_sums, w, marker='o', color='black', alpha=0.2, s=20)
# Set plot properties
plt.xscale('log')
plt.xlabel('Total Appearances')
plt.ylabel('<--- Negative Reviews --- Positive Reviews --->')
plt.title('Support Vector Machine Coefficients (Linear Kernel), IMDB')
# Add text annotations
for i, txt in enumerate(vocab_list):
plt.text(col_sums[i], w[i], txt, fontsize=90*abs(w[i]), color='black')
# Set x-axis limit
plt.xlim(1,50000)
# Show the plot
plt.show()
Grid Search to find C:¶
In the code above we set $C=0.1$, but this parameter was chosen after tuning. I show how to do this below.
$C$ is a hyperparameter that controls the trade-off between maximizing the margin and minimizing the classification error. It represents the penalty imposed on misclassifications during the training process.
When the value of $C$ is small, the SVM algorithm prioritizes maximizing the margin, even if it results in misclassifying some training examples. This approach tends to produce simpler decision boundaries with larger margins, which may generalize better to unseen data. However, too small a value of $C$ can lead to underfitting, where the model may fail to capture the complexities of the data.
On the other hand, when the value of $C$ is large, the algorithm places more emphasis on correctly classifying all training examples, potentially leading to smaller margins and more complex decision boundaries. While this approach may result in better performance on the training data, it also increases the risk of overfitting, where the model memorizes the training data instead of learning general patterns.
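As a quick illustration of this trade-off (a minimal sketch, not part of the original analysis, reusing the X_train_binary, y_train, X_test_dtm, and y_test objects from earlier in the notebook), one could refit the linear SVM at a small, a moderate, and a large $C$ and compare the number of support vectors and the train/test accuracy:
from sklearn.svm import SVC
# Illustrative sketch: how C shifts the margin/error trade-off.
# Reuses X_train_binary, y_train, X_test_dtm, y_test from earlier cells.
for C in [0.001, 0.1, 100]:
    svm_c = SVC(kernel='linear', C=C).fit(X_train_binary, y_train)
    print(f"C={C:<7} support vectors={svm_c.n_support_.sum():>5} "
          f"train acc={svm_c.score(X_train_binary, y_train):.3f} "
          f"test acc={svm_c.score(X_test_dtm, y_test):.3f}")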
from sklearn.model_selection import GridSearchCV
# Define the parameter grid for grid search
# Low to High regularization cost (C)
tune_param = {'C': [0.0001, 0.001, 0.01, 0.1, 0.5, 1, 10, 100, 1000, 10000, 100000]}
print(tune_param)
# Define GridSearchCV with SVM classifier, parameter grid, and 5-fold cross-validation
grid_search_svm = GridSearchCV(SVC(kernel='linear'), tune_param, cv = 5, scoring='accuracy', verbose=3)
grid_search_svm.fit(X_train_binary, y_train)
{'C': [0.0001, 0.001, 0.01, 0.1, 0.5, 1, 10, 100, 1000, 10000, 100000]} Fitting 5 folds for each of 11 candidates, totalling 55 fits [CV 1/5] END ..........................C=0.0001;, score=0.644 total time= 3.1s [CV 2/5] END ..........................C=0.0001;, score=0.500 total time= 3.0s [CV 3/5] END ..........................C=0.0001;, score=0.500 total time= 3.0s [CV 4/5] END ..........................C=0.0001;, score=0.500 total time= 3.0s [CV 5/5] END ..........................C=0.0001;, score=0.500 total time= 3.0s [CV 1/5] END ...........................C=0.001;, score=0.831 total time= 2.9s [CV 2/5] END ...........................C=0.001;, score=0.856 total time= 2.9s [CV 3/5] END ...........................C=0.001;, score=0.822 total time= 2.8s [CV 4/5] END ...........................C=0.001;, score=0.850 total time= 2.9s [CV 5/5] END ...........................C=0.001;, score=0.791 total time= 2.8s [CV 1/5] END ............................C=0.01;, score=0.831 total time= 2.7s [CV 2/5] END ............................C=0.01;, score=0.872 total time= 2.8s [CV 3/5] END ............................C=0.01;, score=0.853 total time= 2.7s [CV 4/5] END ............................C=0.01;, score=0.869 total time= 2.7s [CV 5/5] END ............................C=0.01;, score=0.825 total time= 2.7s [CV 1/5] END .............................C=0.1;, score=0.831 total time= 2.7s [CV 2/5] END .............................C=0.1;, score=0.872 total time= 2.7s [CV 3/5] END .............................C=0.1;, score=0.863 total time= 2.7s [CV 4/5] END .............................C=0.1;, score=0.872 total time= 2.7s [CV 5/5] END .............................C=0.1;, score=0.828 total time= 2.7s [CV 1/5] END .............................C=0.5;, score=0.831 total time= 2.7s [CV 2/5] END .............................C=0.5;, score=0.872 total time= 2.7s [CV 3/5] END .............................C=0.5;, score=0.863 total time= 2.7s [CV 4/5] END .............................C=0.5;, score=0.872 total time= 2.7s [CV 5/5] END .............................C=0.5;, score=0.828 total time= 2.7s [CV 1/5] END ...............................C=1;, score=0.831 total time= 2.8s [CV 2/5] END ...............................C=1;, score=0.872 total time= 2.7s [CV 3/5] END ...............................C=1;, score=0.863 total time= 2.7s [CV 4/5] END ...............................C=1;, score=0.872 total time= 2.7s [CV 5/5] END ...............................C=1;, score=0.828 total time= 2.7s [CV 1/5] END ..............................C=10;, score=0.831 total time= 2.7s [CV 2/5] END ..............................C=10;, score=0.872 total time= 2.7s [CV 3/5] END ..............................C=10;, score=0.863 total time= 2.7s [CV 4/5] END ..............................C=10;, score=0.872 total time= 2.7s [CV 5/5] END ..............................C=10;, score=0.828 total time= 2.7s [CV 1/5] END .............................C=100;, score=0.831 total time= 2.8s [CV 2/5] END .............................C=100;, score=0.872 total time= 2.7s [CV 3/5] END .............................C=100;, score=0.863 total time= 2.7s [CV 4/5] END .............................C=100;, score=0.872 total time= 2.8s [CV 5/5] END .............................C=100;, score=0.828 total time= 2.7s [CV 1/5] END ............................C=1000;, score=0.831 total time= 2.8s [CV 2/5] END ............................C=1000;, score=0.872 total time= 2.8s [CV 3/5] END ............................C=1000;, score=0.863 total time= 2.7s [CV 4/5] END 
............................C=1000;, score=0.872 total time= 2.8s [CV 5/5] END ............................C=1000;, score=0.828 total time= 2.7s [CV 1/5] END ...........................C=10000;, score=0.831 total time= 2.8s [CV 2/5] END ...........................C=10000;, score=0.872 total time= 2.7s [CV 3/5] END ...........................C=10000;, score=0.863 total time= 2.7s [CV 4/5] END ...........................C=10000;, score=0.872 total time= 2.8s [CV 5/5] END ...........................C=10000;, score=0.828 total time= 2.7s [CV 1/5] END ..........................C=100000;, score=0.831 total time= 2.8s [CV 2/5] END ..........................C=100000;, score=0.872 total time= 2.7s [CV 3/5] END ..........................C=100000;, score=0.863 total time= 2.7s [CV 4/5] END ..........................C=100000;, score=0.872 total time= 2.8s [CV 5/5] END ..........................C=100000;, score=0.828 total time= 2.7s
GridSearchCV(cv=5, estimator=SVC(kernel='linear'), param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 0.5, 1, 10, 100, 1000, 10000, 100000]}, scoring='accuracy', verbose=3)
# Print the best hyperparameters
# We used the best C above
print("Best hyperparameters:", grid_search_svm.best_params_)
# Print the best cross-validation score
print("Best cross-validation score:", grid_search_svm.best_score_)
Best hyperparameters: {'C': 0.1}
Best cross-validation score: 0.853125
# Extract mean cross-validation scores and corresponding values of C
mean_scores = grid_search_svm.cv_results_['mean_test_score']
C_values = tune_param['C']
# Plot the graph
plt.figure(figsize=(8, 6))
plt.plot(np.log10(C_values), mean_scores, marker='o', linestyle='-')
plt.title('Mean Cross-Validation Score vs. Log10(C)')
plt.xlabel('Log10(C)')
plt.ylabel('Mean Cross-Validation Score')
plt.grid(True)
plt.xticks(np.log10(C_values), [f"1e{x:.0f}" for x in np.log10(C_values)])
plt.show()
# Predict using the best estimator found by grid search
y_pred_svm_grid = grid_search_svm.best_estimator_.predict(X_test_dtm)
# Calculate and print the accuracy of the best estimator on the testing data
accuracy_svm_grid = accuracy_score(y_test, y_pred_svm_grid)
print("Accuracy:", accuracy_svm_grid)
Accuracy: 0.84
Random Forests¶
Random Forest is an ensemble learning algorithm: it constructs multiple decision trees during training and, for classification, outputs the mode (most frequent) class label among the individual trees.
Bagging: During the training phase, Random Forest builds a collection of decision trees. Each decision tree is constructed using a random subset of the training data and a random subset of the features, which are typically words in the case of text classification. At each node of the decision tree, a random subset of features is considered for splitting, and the best split is chosen based on criteria such as Gini impurity or information gain. This process continues recursively until a stopping criterion is met, such as reaching a maximum tree depth or a minimum number of samples in a leaf node.
Majority Voting: Once all decision trees are built, predictions are made independently by each tree. For a given input text document, each decision tree predicts a class label based on the majority class of the training samples in the corresponding leaf node. The final predicted class label for the input document is determined by aggregating the individual predictions from all decision trees. This can be done by taking a majority vote (mode) among the predictions.
Document term matrix:¶
Random forests are computationally intensive, so we will cut the number of features way down just so this can run in a reasonable amount of time.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
# Set seed
np.random.seed(1234)
# Use CountVectorizer for text vectorization (binary unigram presence)
# min_df=50 / max_df=300: a token must appear in at least 50 and at most 300 documents to be included in the vocabulary.
rf_vectorizer = CountVectorizer(min_df=50, max_df=300, binary=True)
# Convert the text data into a document-term matrix
rf_X_train = rf_vectorizer.fit_transform(train_data)
rf_X_test = rf_vectorizer.transform(test_data)
rf_vocab_list = rf_vectorizer.get_feature_names_out()
# The earlier models used 41991 features
X_train_binary
<1600x41991 sparse matrix of type '<class 'numpy.int64'>' with 547236 stored elements in Compressed Sparse Row format>
# The random forest will use only 1511 features
rf_X_train
<1600x1511 sparse matrix of type '<class 'numpy.int64'>' with 164283 stored elements in Compressed Sparse Row format>
RF Fit:¶
# Create and train the Random Forest model
np.random.seed(1234)
classifier_rf = RandomForestClassifier(n_estimators=100, max_features=20)
classifier_rf.fit(rf_X_train, y_train)
RandomForestClassifier(max_features=20)
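To connect this back to the majority-voting description above, here is a minimal sketch that tallies the individual tree votes for the first test document, using the classifier_rf and rf_X_test objects from this section (strictly speaking, scikit-learn averages the per-tree predicted probabilities rather than taking a hard vote, but for fully grown trees the two usually agree):
import numpy as np
# Votes of each individual tree for the first test document.
# The sub-estimators predict encoded class indices, so we map back via classes_.
doc = rf_X_test[0].toarray()
tree_votes = np.array([tree.predict(doc)[0] for tree in classifier_rf.estimators_])
for cls_idx, n_votes in zip(*np.unique(tree_votes, return_counts=True)):
    print(f"class {classifier_rf.classes_[int(cls_idx)]}: {int(n_votes)} votes")
print("Forest prediction:", classifier_rf.predict(doc)[0])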
RF Evaluate:¶
# Predictions - predict class probabilities and labels for the test data
predicted_proba_rf = classifier_rf.predict_proba(rf_X_test)
predictions_rf = classifier_rf.predict(rf_X_test)
# Evaluate the model
accuracy_rf = accuracy_score(y_test, predictions_rf)
report_rf = classification_report(y_test, predictions_rf)
print("Accuracy:", accuracy_rf)
print("Summary", report_rf)
Accuracy: 0.8225
Summary               precision    recall  f1-score   support

           0       0.82      0.82      0.82       199
           1       0.82      0.83      0.82       201

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400
# Evaluate the model
accuracy_train_rf = classifier_rf.score(rf_X_train, y_train)
print("Accuracy training set:", accuracy_train_rf)
accuracy_test_rf = classifier_rf.score(rf_X_test, y_test)
print("Accuracy test set:", accuracy_test_rf)
Accuracy training set: 1.0
Accuracy test set: 0.8225
Feature Importance:¶
MDI importances:¶
feature_importances_ gives a numerical importance score assigned to each feature by the Random Forest classifier and is calculated using the Mean Decrease in Impurity (MDI) method. This method computes the importance score of a feature based on the decrease in impurity (typically measured using the Gini impurity) achieved by splitting the data on that feature during the construction of the individual decision trees in the Random Forest ensemble.
But feature importances simply tell us which features were more useful when building the model. They should not be interpreted as coefficients (we cannot interpret them as "correlations" or "strength coefficients") giving a direct dependence between predictor and target. If your model does not generalize to validation data, then feature importances have no meaning 1.
Feature importances are specific to the training data used to build the model. If the model does not generalize well to unseen data, such as validation or test data, then the feature importances may not accurately reflect the true importance of features in making predictions on new data.
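As a sanity check on where these numbers come from (a minimal sketch using the classifier_rf object fit above), the forest-level MDI scores are just the per-tree impurity-based importances averaged over all trees and renormalized:
import numpy as np
# Average the per-tree (already normalized) impurity-based importances and renormalize;
# this should reproduce classifier_rf.feature_importances_.
per_tree = np.array([tree.feature_importances_ for tree in classifier_rf.estimators_])
manual_mdi = per_tree.mean(axis=0)
manual_mdi = manual_mdi / manual_mdi.sum()
print(np.allclose(manual_mdi, classifier_rf.feature_importances_))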
mdi_importances = pd.Series(
classifier_rf.feature_importances_, index=rf_vocab_list
)
ax = mdi_importances.sort_values(ascending=True)[-30:].plot.barh()
ax.set_title("Random Forest Feature Importances (MDI)")
ax.figure.tight_layout()
top_word_indices = np.argsort(mdi_importances)[::-1][:30]
print(rf_vocab_list[top_word_indices.values])
print(mdi_importances.values[top_word_indices.values])
['worst' 'stupid' 'boring' 'mess' 'lame' 'perfectly' 'excellent' 'ridiculous' 'performances' 'awful' 'effective' 'outstanding' 'perfect' 'wasted' 'supposed' 'dull' 'allows' 'waste' 'wonderfully' 'memorable' 'others' 'terrible' 'looks' 'subtle' 'maybe' 'overall' 'hilarious' 'terrific' 'true' 'unfortunately'] [0.00721143 0.00609041 0.00603721 0.00499713 0.00485021 0.00437587 0.00424904 0.00411215 0.00406847 0.00405618 0.00403702 0.00401124 0.00388677 0.00378624 0.00378257 0.00369268 0.00359577 0.00345391 0.00321271 0.00318997 0.0031568 0.00313495 0.00304673 0.00281922 0.002758 0.00258836 0.00254304 0.00251487 0.00240151 0.00234664]
Permutation importances (docs)¶
Another approach is to calculate permutation importances for the random forest, computed on a held-out test set. They measure the decrease in model performance (for example, accuracy or F1 in classification tasks) when the values of a feature are randomly permuted. By comparing the model's performance with the original feature values to its performance with permuted feature values, we can determine the importance of each feature in making accurate predictions.
Permutation importances are computed on a metric of your choice (accuracy/precision/recall etc.). They are easier to interpret and can in some sense be seen as a "strength coefficient", since they answer the question: "How much does the performance of my model degrade if I shuffle this predictor?"1
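Before calling scikit-learn's helper, it may help to see the idea done by hand for a single feature. The sketch below (picking 'worst' purely as an illustrative feature) shuffles one column of the test matrix and measures the resulting drop in accuracy, reusing classifier_rf, rf_X_test, y_test, and rf_vocab_list from this section:
import numpy as np
# Manual permutation importance for one feature; 'worst' is an illustrative choice.
rng = np.random.default_rng(42)
X_dense = rf_X_test.toarray()
baseline = classifier_rf.score(X_dense, y_test)
col = list(rf_vocab_list).index("worst")
X_shuffled = X_dense.copy()
X_shuffled[:, col] = rng.permutation(X_shuffled[:, col])
print("Drop in accuracy after shuffling 'worst':",
      baseline - classifier_rf.score(X_shuffled, y_test))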
from sklearn.inspection import permutation_importance
# Measures the decrease in model performance when the values of each feature are randomly shuffled.
# The analysis is repeated 100 times (n_repeats=100) to obtain estimates of feature importance.
# The results include the mean importance and standard deviation for each feature.
permute_importances = permutation_importance(
classifier_rf, rf_X_test.toarray(), y_test, n_repeats=100, random_state=42, n_jobs=5, scoring="accuracy"
)
# Create a DataFrame from permutation importances
permute_importances_df = pd.DataFrame({
'importances_mean': permute_importances.importances_mean,
'importances_std': permute_importances.importances_std},
index = rf_vocab_list)
#permute_importances_df["word"] = rf_vocab_list
permute_importances_df = permute_importances_df.sort_values("importances_mean", ascending=False)
permute_importances_df.head()
importances_mean | importances_std | |
---|---|---|
worst | 0.019750 | 0.006629 |
boring | 0.018600 | 0.006096 |
mess | 0.012975 | 0.005902 |
awful | 0.012350 | 0.005301 |
dull | 0.008875 | 0.004588 |
import matplotlib.pyplot as plt
top_n = 30
fig, ax = plt.subplots()
permute_importances_df.importances_mean[:top_n].plot.bar(yerr=permute_importances_df.importances_std[:top_n], ax=ax)
ax.set_title("Random Forest Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()
ROC¶
The Receiver Operating Characteristic (ROC) curve is a graphical representation of the performance of a binary classification model across different threshold settings. It plots the true positive rate (sensitivity) against the false positive rate (1 - specificity) for various threshold values. Each point on the curve represents the performance of the model at a particular threshold, and a diagonal line from the bottom left corner to the top right corner represents the performance of a random classifier. The ROC curve thus looks in more depth at the underlying scores/probabilities that indicate how sure the classifier is.
A perfect classifier would have a ROC curve that passes through the top left corner (true positive rate of 1 and false positive rate of 0), indicating that it achieves perfect sensitivity (correctly identifies all positive samples) while making no false positive predictions.
The area under the ROC curve (AUC) is a summary measure of the classifier's performance, with higher values indicating better discrimination between the positive and negative classes. It provides a single value summarizing the overall performance of the classifier.
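To make the threshold idea concrete, the short sketch below computes the (TPR, FPR) pair by hand at a few cut-offs for the Random Forest probabilities (assuming predicted_proba_rf and y_test from above, with class 1 treated as the positive class):
import numpy as np
# Hand-computed (TPR, FPR) points at a few probability thresholds.
scores = predicted_proba_rf[:, 1]
y_true = np.asarray(y_test)
for thresh in [0.3, 0.5, 0.7]:
    y_hat = (scores >= thresh).astype(int)
    tpr = np.mean(y_hat[y_true == 1] == 1)  # sensitivity
    fpr = np.mean(y_hat[y_true == 0] == 1)  # 1 - specificity
    print(f"threshold={thresh:.1f}  TPR={tpr:.3f}  FPR={fpr:.3f}")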
from sklearn.metrics import roc_curve, auc
# predicted_prob_nb, predicted_proba_svm, and predicted_proba_rf hold the predicted class probabilities from the Naive Bayes, SVM, and Random Forest models above
# Calculate ROC curve and AUC for Naive Bayes
fpr_naive, tpr_naive, _ = roc_curve(y_test, predicted_prob_nb[:, 1])
roc_auc_naive = auc(fpr_naive, tpr_naive)
actual_class = y_test
# Calculate ROC curve and AUC for SVM
fpr_svm, tpr_svm, _ = roc_curve(y_test, predicted_proba_svm[:, 1])
roc_auc_svm = auc(fpr_svm, tpr_svm)
# Calculate ROC curve and AUC for Random Forest
fpr_rf, tpr_rf, _ = roc_curve(y_test, predicted_proba_rf[:, 1])
roc_auc_rf = auc(fpr_rf, tpr_rf)
# Plot ROC curves
plt.figure(figsize=(8, 6))
plt.plot(fpr_naive, tpr_naive, color='green', lw=2, label=f'Naive Bayes (AUC = {roc_auc_naive:.2f})')
plt.plot(fpr_svm, tpr_svm, color='blue', lw=2, label=f'SVM (AUC = {roc_auc_svm:.2f})')
plt.plot(fpr_rf, tpr_rf, color='red', lw=2, label=f'Random Forest (AUC = {roc_auc_rf:.2f})')
plt.plot([0, 1], [0, 1], color='black', lw=1, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()
SVM is slightly better, followed by the Random Forest and Naive Bayes. We used much less information in building the Random Forest because of computation time, but its performance is surprisingly comparable to the others.