NLP with NLTK & Transformers

Contents: The Data | Basic NLTK | Vader Scoring | Pre-trained Roberta | Comparing Models | Quick Pipelining

Links: YouTube | Hugging Face Models | Project Notebook

- VADER (Valence Aware Dictionary and Sentiment Reasoner) - a bag-of-words approach
- RoBERTa pretrained model from Hugging Face - a more advanced, transformer model
- Hugging Face pipeline
from helpers_cotton_candy import *   # notebook display helpers: pretty, head_tail_vert, multi, header_text
import_all()                         # bulk-imports the usual data libraries (pandas as pd, etc.)
import matplotlib.pyplot as plt
plt.style.use('cotton_candy.mplstyle')
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm as tq
%matplotlib inline
%%html
<style>
a:link {color: #4c1fbf !important; font-weight: 600 !important;}
a:visited {color: #4c1fbf !important; font-weight: 600 !important;}
</style>
The Data

- Sentiment analysis performed on Amazon fine food text reviews
- Also includes a rating out of 5 stars
- CSV format
# df = pd.read_csv('https://mydatabucky.s3.amazonaws.com/amazon_sentiment/Reviews.csv')
# df.to_csv('amazon_data.csv')
df = pd.read_csv('amazon_data.csv')
head_tail_vert(df['Text'], 3, 'Imported Data')
Imported Data: head(3)

| | Text |
|---|---|
0 | I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than most. |
1 | Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo". |
2 | This is a confection that has been around a few centuries. It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar. And it is a tiny mouthful of heaven. Not too chewy, and very flavorful. I highly recommend this yummy treat. If you are familiar with the story of C.S. Lewis' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch. |
Imported Data: tail(3)

| | Text |
|---|---|
568451 | These stars are small, so you can give 10-15 of those in one training session. I tried to train our dog with "Ceaser dog treats", it just made our puppy hyper. If you compare the ingredients, you will know why. Little stars has just basic food ingredients without any preservatives and food coloring. Sweet potato flavor also did not make my hand smell like dog food. |
568452 | These are the BEST treats for training and rewarding your dog for being good while grooming. Lower in calories and loved by all the doggies. Sweet potatoes seem to be their favorite Wet Noses treat! |
568453 | I am very satisfied ,product is as advertised, I use it on cereal, with raw vinegar, and as a general sweetner. |
ax = df['Score'].value_counts() \
       .sort_index() \
       .plot(kind='barh',
             color='C2',
             title='Count of Reviews by Stars');
ax.set_ylabel('Review Stars');
ax.set_xlabel('Count');
Basic NLTK

- NLTK tutorial
- Parts-of-speech labels
example = df['Text'][50]
pretty(example, 'example')
example
This oatmeal is not good. Its mushy, soft, I don't like it. Quaker Oats is the way to go.
# nltk.download('punkt')
tokens = nltk.word_tokenize(example)
pretty(tokens[:10], 'tokens[:10]')
tokens[:10]
['This', 'oatmeal', 'is', 'not', 'good', '.', 'Its', 'mushy', ',', 'soft']
# nltk.download('averaged_perceptron_tagger')
tagged = nltk.pos_tag(tokens)
pretty(tagged[:10], 'tagged[:10]')
tagged[:10]
[('This', 'DT'), ('oatmeal', 'NN'), ('is', 'VBZ'), ('not', 'RB'), ('good', 'JJ'), ('.', '.'), ('Its', 'PRP$'), ('mushy', 'NN'), (',', ','), ('soft', 'JJ')]
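If a tag like 'NN' or 'PRP$' is unfamiliar, NLTK can print its definition along with example words (a quick sketch; needs the 'tagsets' resource downloaded once):

# nltk.download('tagsets')
nltk.help.upenn_tagset('NN')   # prints the tag's meaning with example words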
# nltk.download('maxent_ne_chunker')
# nltk.download('words')
groups = nltk.chunk.ne_chunk(tagged)
groups.pprint()
(S This/DT oatmeal/NN is/VBZ not/RB good/JJ ./. Its/PRP$ mushy/NN ,/, soft/JJ ,/, I/PRP do/VBP n't/RB like/VB it/PRP ./. (ORGANIZATION Quaker/NNP Oats/NNPS) is/VBZ the/DT way/NN to/TO go/VB ./.)
pretty(len(groups), 'len(groups)')
len(groups)
24
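A minimal sketch of pulling just the named entities back out of the chunk tree — entity subtrees carry labels like ORGANIZATION, while ordinary tokens remain (word, tag) tuples:

from nltk.tree import Tree

entities = [' '.join(word for word, tag in subtree.leaves())
            for subtree in groups if isinstance(subtree, Tree)]
print(entities)   # ['Quaker Oats']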
Vader Sentiment Scoring

Uses a 'bag of words' approach:
- stopwords (such as and, the, an) carry no sentiment weight
- each remaining word is scored, and the scores are combined into a total score
# nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()
analyzer.polarity_scores('What a great day!')
{'neg': 0.0, 'neu': 0.313, 'pos': 0.687, 'compound': 0.6588}
analyzer.polarity_scores('Your breath stinks!')
{'neg': 0.534, 'neu': 0.466, 'pos': 0.0, 'compound': -0.3164}
analyzer.polarity_scores('I do not care!')
{'neg': 0.593, 'neu': 0.407, 'pos': 0.0, 'compound': -0.4442}
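Under the hood, every recognized word has a fixed valence in VADER's lexicon, roughly -4 (most negative) to +4 (most positive); words absent from the lexicon contribute nothing. A quick peek (exact values depend on the lexicon version shipped with NLTK):

for word in ['great', 'stinks', 'care', 'the']:
    print(word, analyzer.lexicon.get(word))   # None means the word carries no score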
pretty(example, 'example')
pretty(analyzer.polarity_scores(example),
'analyzer.polarity_scores(example)')
example
This oatmeal is not good. Its mushy, soft, I don't like it. Quaker Oats is the way to go.

analyzer.polarity_scores(example)
{'neg': 0.22, 'neu': 0.78, 'pos': 0.0, 'compound': -0.5448}
Reducing Data Size
pretty(f"{len(df):,}", 'Length of entire dataframe:')
smaller_df = df[0:11111]
Length of entire dataframe:
568,454
pretty(f"{len(smaller_df):,}", 'Length of smaller dataframe:')
Length of smaller dataframe:
11,111
Getting polarity scores
polarity_data = {}
for idx, row in tq(smaller_df.iterrows()):
    text_data = row['Text']
    id_label = row['Id']
    polarity_data[id_label] = analyzer.polarity_scores(text_data)
    break   # stop after the first row, just to inspect the output shape
polarity_data
{1: {'neg': 0.0, 'neu': 0.695, 'pos': 0.305, 'compound': 0.9441}}
multi([(pd.DataFrame(polarity_data), 'polarity_data df'),
(pd.DataFrame(polarity_data).T, 'polarity_data df transposed')])
polarity_data df

| | 1 |
|---|---|
| compound | 0.94 |
| neg | 0.00 |
| neu | 0.69 |
| pos | 0.30 |

polarity_data df transposed

| | compound | neg | neu | pos |
|---|---|---|---|---|
| 1 | 0.94 | 0.00 | 0.69 | 0.30 |
def polarity_scores(df,
                    text_col,
                    id_col):
    from nltk.sentiment import SentimentIntensityAnalyzer
    from tqdm.notebook import tqdm as tq
    analyzer = SentimentIntensityAnalyzer()
    polarity_data = {}
    for idx, row in tq(df.iterrows(), total=len(df)):
        text_data = row[text_col]
        id_label = row[id_col]
        polarity_data[id_label] = analyzer.polarity_scores(text_data)
    vaders = pd.DataFrame(polarity_data).T
    vaders = vaders.reset_index().rename(columns={'index': 'Id'})
    vaders = vaders.merge(df, how='left')   # joins back on the shared 'Id' column
    return vaders
df_sentiments = polarity_scores(smaller_df, 'Text', 'Id')
df_sentiments.to_csv('df_sentiments.csv')
df_sentiments = pd.read_csv('df_sentiments.csv')
DataFrame with sentiments
df_sentiments.drop(columns = ['Unnamed: 0'],
inplace = True)
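The 'Unnamed: 0' column is just the old index that to_csv wrote out; saving with index=False avoids the round-trip artifact entirely:

df_sentiments.to_csv('df_sentiments.csv', index=False)   # no stray index column on re-read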
head_tail_vert(df_sentiments[['neg', 'neu', 'pos', 'compound', 'Score', 'Text']],
3, 'Smaller DF with Sentiments')
Smaller DF with Sentiments: head(3)

| | neg | neu | pos | compound | Score | Text |
|---|---|---|---|---|---|---|
0 | 0.00 | 0.69 | 0.30 | 0.94 | 5 | I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than most. |
1 | 0.14 | 0.86 | 0.00 | -0.57 | 1 | Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo". |
2 | 0.09 | 0.75 | 0.15 | 0.83 | 4 | This is a confection that has been around a few centuries. It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar. And it is a tiny mouthful of heaven. Not too chewy, and very flavorful. I highly recommend this yummy treat. If you are familiar with the story of C.S. Lewis' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch. |
Smaller DF with Sentiments: tail(3)

| | neg | neu | pos | compound | Score | Text |
|---|---|---|---|---|---|---|
11108 | 0.09 | 0.85 | 0.07 | -0.41 | 3 | Olive oil in large bold print on the front of the bag is misleading!! Olive oil is NOT the main oil the chips are cooked in. The product has more sunflower, corn and canola oils than it does olive oil!! The ingredients are NOT listed in the description. I should have known better.<br />There are other, much better "Olive Oil" potato chips on the market that use ONLY olive oil. I was quite disappointed in this brand. |
11109 | 0.10 | 0.81 | 0.09 | -0.46 | 1 | I love salt & vinegar kettle chips so I decided to try these after reading good reviews. When I received them, I noticed from the very first bag that over half of the chips were inedible due to rotten potatoes! I know how kettle chips should look, and I know what normal marks look like on chips. These were rotten. After waiting over a month for a response from the company, I finally found a phone number and called them, only to be told there was nothing they could do. The sad thing is, the chips that were edible were actually flavorful and tasty. Obviously they just don't care about consistent quality, so stay away from these chips! |
11110 | 0.00 | 0.93 | 0.07 | 0.44 | 2 | The kettle chip itself is crisp, crunchy and good, although somewhat greasy. Over coated with the flavoring causing mouth burn when it was not expected in a chip with Rosemary flavoring. Leaves greasy, gritty residue on fingers after two or three chips. |
Polarity Compound Score versus Amazon Customer Review in Stars
import seaborn as sns
ax = sns.barplot(data = df_sentiments,
x = 'Score',
y = 'compound');
ax.set_title('Compound Score vs Customer Stars Review');
Vader sentiment ratings vs stars
fig, axs = plt.subplots(1, 3, figsize=(20, 7))
sns.barplot(data=df_sentiments, x='Score', y='pos', ax=axs[0])
sns.barplot(data=df_sentiments, x='Score', y='neu', ax=axs[1])
sns.barplot(data=df_sentiments, x='Score', y='neg', ax=axs[2])
axs[0].set_title('Positive Sentiment', fontsize=25)
axs[0].set_xlabel(''); axs[0].set_ylabel('pos', size=10)
axs[1].set_title('Neutral Sentiment', fontsize=25)
axs[1].set_xlabel(''); axs[1].set_ylabel('neu', size=10)
axs[2].set_title('Negative Sentiment', fontsize=25)
axs[2].set_xlabel(''); axs[2].set_ylabel('neg', size=10)
plt.tight_layout()
Roberta Pretrained Model

- Also takes into account context and more subtle, connotational meanings
- Transformer-based deep learning models like this pick up on the relationships between words and their contexts
- These are Hugging Face models
- This model has been pretrained extensively on sentiment data
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
Defining Roberta Model
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
Vader results on example
print(example)
analyzer.polarity_scores(example)
This oatmeal is not good. Its mushy, soft, I don't like it. Quaker Oats is the way to go.
{'neg': 0.22, 'neu': 0.78, 'pos': 0.0, 'compound': -0.5448}
Roberta results on example
# pt means PyTorch
encoded_text = tokenizer(example, return_tensors='pt')
encoded_text
{'input_ids': tensor([[ 0, 713, 1021, 38615, 16, 45, 205, 4, 3139, 39589, 219, 6, 3793, 6, 38, 218, 75, 101, 24, 4, 3232, 4218, 384, 2923, 16, 5, 169, 7, 213, 4, 2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
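To see what those ids stand for, the tokenizer can map them back to the subword tokens the model actually consumes (a quick sketch):

ids = encoded_text['input_ids'][0].tolist()
print(tokenizer.convert_ids_to_tokens(ids)[:10])   # '<s>' start token, then subword pieces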
Roberta output format
output = model(**encoded_text)
output
SequenceClassifierOutput(loss=None, logits=tensor([[ 3.1436, -0.7107, -2.6559]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
Unscaled scores
scores = output[0][0].detach().numpy()
scores
array([ 3.1436293, -0.7106685, -2.6558964], dtype=float32)
Softmax scores
# softmax rescales the 3 logits into probabilities between 0 and 1 that sum to 1
scores = softmax(scores)
scores
array([0.97635514, 0.02068748, 0.00295737], dtype=float32)
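The same numbers can be reproduced by hand, which makes the rescaling transparent — exponentiate each logit, then divide by the sum:

import numpy as np

logits = np.array([3.1436293, -0.7106685, -2.6558964])   # raw model output from above
print(np.exp(logits) / np.exp(logits).sum())             # matches scipy's softmax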
Putting scores into a dict
scores_dict = {
    'roberta_neg' : scores[0],   # label order (negative, neutral, positive) per the model card
    'roberta_neu' : scores[1],
    'roberta_pos' : scores[2]
}
print(scores_dict)
{'roberta_neg': 0.97635514, 'roberta_neu': 0.020687476, 'roberta_pos': 0.0029573706}
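When a single label is all that's needed, the highest-probability entry does the job:

print(max(scores_dict, key=scores_dict.get))   # 'roberta_neg' for this example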
polarity_scores_roberta(example)
def polarity_scores_roberta(example):
    encoded_text = tokenizer(example, return_tensors='pt')
    output = model(**encoded_text)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    scores_dict = {
        'roberta_neg' : scores[0],
        'roberta_neu' : scores[1],
        'roberta_pos' : scores[2]
    }
    return scores_dict
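Calling it on the running example reproduces the scores computed step by step above:

polarity_scores_roberta(example)
# {'roberta_neg': 0.976..., 'roberta_neu': 0.021..., 'roberta_pos': 0.003...}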
Getting results on dataframe of length 11,111
def roberta_results(df):
    results = {}
    baddies = []
    for idx, row in tq(df.iterrows(), total=len(df)):
        try:
            text = row['Text']
            myid = row['Id']
            vader_result = analyzer.polarity_scores(text)
            vader_result_rename = {}
            for key, value in vader_result.items():
                vader_result_rename[f"vader_{key}"] = value
            roberta_result = polarity_scores_roberta(text)
            both = {**vader_result_rename, **roberta_result}
            results[myid] = both
        except RuntimeError:
            # typically reviews longer than the model's maximum sequence length
            baddies.append(myid)
    header_text(f'There were {len(baddies)} dropped samples that failed.')
    return results, baddies
results, baddies = roberta_results(df_sentiments)
There were 96 dropped samples that failed.
results_df = pd.DataFrame(results).T
results_df = results_df.reset_index().rename(columns={'index': 'Id'})
results_df = results_df.merge(df, how='left')
# results_df.to_csv('amazon_sentiment_results.csv')
head_tail_vert(results_df[['Id','vader_neg','vader_neu',
'vader_pos','vader_compound',
'roberta_neu', 'roberta_pos']],
5, 'Results dataframe')
Results dataframe: head(5)

| | Id | vader_neg | vader_neu | vader_pos | vader_compound | roberta_neu | roberta_pos |
|---|---|---|---|---|---|---|---|
0 | 1 | 0.00 | 0.69 | 0.30 | 0.94 | 0.05 | 0.94 |
1 | 2 | 0.14 | 0.86 | 0.00 | -0.57 | 0.45 | 0.04 |
2 | 3 | 0.09 | 0.75 | 0.15 | 0.83 | 0.10 | 0.90 |
3 | 4 | 0.00 | 1.00 | 0.00 | 0.00 | 0.09 | 0.91 |
4 | 5 | 0.00 | 0.55 | 0.45 | 0.95 | 0.01 | 0.99 |
Results dataframe: tail(5)

| | Id | vader_neg | vader_neu | vader_pos | vader_compound | roberta_neu | roberta_pos |
|---|---|---|---|---|---|---|---|
11010 | 11107 | 0.00 | 0.75 | 0.25 | 0.96 | 0.03 | 0.97 |
11011 | 11108 | 0.00 | 0.57 | 0.43 | 0.94 | 0.19 | 0.78 |
11012 | 11109 | 0.09 | 0.85 | 0.07 | -0.41 | 0.10 | 0.01 |
11013 | 11110 | 0.10 | 0.81 | 0.09 | -0.46 | 0.25 | 0.12 |
11014 | 11111 | 0.00 | 0.93 | 0.07 | 0.44 | 0.49 | 0.20 |
Comparing Model Scores
Sentiment analysis comparison
sns.pairplot(data=results_df,
vars=['vader_neg', 'vader_neu', 'vader_pos',
'roberta_neg', 'roberta_neu', 'roberta_pos'],
hue='Score',
palette='tab10')
plt.show()
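Beyond eyeballing the pairplot, a quick correlation gives a one-number summary of how closely the two models track each other (a minimal sketch):

print(results_df[['vader_compound', 'roberta_pos', 'roberta_neg']].corr())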
1 star: Roberta model correctly scored positive
results_df.query('Score == 1') \
.sort_values('roberta_pos', ascending=False)['Text'].values[0]
'Bisquick GF is easy to use. Pancakes and muffins are very<br />tasty. The product is quick and easy to use. It makes my day. Gram'
1 star: Vader model incorrectly scored positive
results_df.query('Score == 1') \
.sort_values('vader_pos', ascending=False)['Text'].values[0]
'This flavor is horrible. There are many other flavors much better. Hawaiian Hazelnut is great! Breakfast in Bed is AWesome!'
5 stars: Roberta model scored negative (all caps, maybe?)
results_df.query('Score == 5') \
.sort_values('roberta_neg', ascending=False)['Text'].values[0]
'THEY ARE DELICIOUS NOT SALTY YOU CAN SIT AND EAT 2-3 2 OUNCE BAGS AT A TIME I GIVE THESE 5 STARS PLUS I HAVE TO STOP MYSELF FROM EATING THEM THEY ARE DELICIOUS.'
5 stars: Vader model scored negative
results_df.query('Score == 5') \
.sort_values('vader_neg', ascending=False)['Text'].values[0]
'My two cats must not be interested in grass, because it grew but they ignored it. Had no problems growing it.'
Transformer Pipeline

- Hugging Face Transformers
from transformers import pipeline
analyze_sentiment = pipeline("sentiment-analysis")
No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english). Using a pipeline without specifying a model name and revision in production is not recommended.
analyze_sentiment('I love sentiment analysis!')
[{'label': 'POSITIVE', 'score': 0.9997853636741638}]
analyze_sentiment('Make sure to like and subscribe!')
[{'label': 'POSITIVE', 'score': 0.9991742968559265}]
analyze_sentiment('booo')
[{'label': 'NEGATIVE', 'score': 0.9936267137527466}]
analyze_sentiment('This place is a mess.')
[{'label': 'NEGATIVE', 'score': 0.9997958540916443}]
analyze_sentiment('That is barely interesting.')
[{'label': 'NEGATIVE', 'score': 0.999799907207489}]
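The warning above goes away if the model (and ideally a revision) is pinned explicitly — here with the same default checkpoint, just named outright:

analyze_sentiment = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
analyze_sentiment('Pinning the model avoids a silently changing default.')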