SlideShare a Scribd company logo
import gensim, logging
# Configure the root logger: timestamp, level name and message, at INFO level and above.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Load word vectors stored in binary word2vec format from a path relative to the
# current working directory.
# NOTE(review): `gmodel` is not referenced anywhere else in the visible script —
# confirm it is still needed; the file is presumably large and slow to load.
gmodel = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
def extract_words(sent):
    """Normalize a raw text string and split it into lowercase word tokens.

    Processing order: lowercase the text, replace HTML tags with a space,
    drop apostrophes that sit between two word characters ("don't" ->
    "dont"), replace every remaining non-word character with a space,
    collapse whitespace runs, strip, and split on whitespace.

    Args:
        sent: Raw text (a single sentence or a whole document).

    Returns:
        A list of tokens; an empty list when no word characters remain.
    """
    sent = sent.lower()
    sent = re.sub(r'<[^>]+>', ' ', sent)  # strip html tags
    # Join the two sides of an in-word apostrophe so contractions stay one token.
    sent = re.sub(r"(\w)'(\w)", r'\1\2', sent)  # remove apostrophes
    sent = re.sub(r'\W', ' ', sent)  # remove punctuation
    sent = re.sub(r'\s+', ' ', sent)  # remove repeated spaces
    sent = sent.strip()
    return sent.split()
# unsupervised training data
import re
import os
# Unlabelled training corpus: one TaggedDocument per text unit, each carrying a
# single tag string that is unique within its source.
unsup_sentences = []

# source: http://ai.stanford.edu/~amaas/data/sentiment/, data from IMDB
# Each .txt file is read whole and becomes one document tagged "<subdir>/<filename>".
for dirname in ["train/pos", "train/neg", "train/unsup", "test/pos", "test/neg"]:
    for fname in sorted(os.listdir("aclImdb/" + dirname)):
        if not fname.endswith('.txt'):
            continue
        with open("aclImdb/" + dirname + "/" + fname, encoding='UTF-8') as f:
            sent = f.read()
        words = extract_words(sent)
        unsup_sentences.append(TaggedDocument(words, [dirname + "/" + fname]))

# source: http://www.cs.cornell.edu/people/pabo/movie-review-data/
# Here every line of a file is its own document, tagged "<dir>/<filename>-<line index>".
for dirname in ["review_polarity/txt_sentoken/pos", "review_polarity/txt_sentoken/neg"]:
    for fname in sorted(os.listdir(dirname)):
        if not fname.endswith('.txt'):
            continue
        with open(dirname + "/" + fname, encoding='UTF-8') as f:
            for i, sent in enumerate(f):
                words = extract_words(sent)
                unsup_sentences.append(TaggedDocument(words, ["%s/%s-%d" % (dirname, fname, i)]))

# source: https://nlp.stanford.edu/sentiment/, data from Rotten Tomatoes
# One document per line of the snippets file, tagged "rt-<line index>".
with open("stanfordSentimentTreebank/original_rt_snippets.txt", encoding='UTF-8') as f:
    for i, line in enumerate(f):
        # Tokenize the line just read; the previous code passed the stale `sent`
        # left over from the loop above, so every snippet got identical words.
        words = extract_words(line)
        unsup_sentences.append(TaggedDocument(words, ["rt-%d" % i]))
import random
class PermuteSentences(object):
    """Re-iterable wrapper that yields its items in a new random order on each pass.

    The wrapped collection itself is never reordered: every call to
    ``__iter__`` shuffles a fresh copy using the module-level ``random`` state.
    """

    def __init__(self, sents):
        # Only a reference is stored; copying happens per iteration.
        self.sents = sents

    def __iter__(self):
        order = list(self.sents)  # private copy so the caller's data keeps its order
        random.shuffle(order)
        yield from order
# Hand Doc2Vec a wrapper rather than the raw list, so each time the corpus is
# iterated the documents come out in a different random order.
permuter = PermuteSentences(unsup_sentences)
# NOTE(review): per the gensim docs dm=0 selects the PV-DBOW training mode and hs=1
# enables hierarchical softmax; `size` is the vector dimensionality in gensim < 4.0
# (renamed `vector_size` in 4.0) — confirm against the installed gensim version.
model = Doc2Vec(
    permuter,
    dm=0,
    hs=1,
    size=50,
)
# done with training, free up some memory
# NOTE(review): this method exists only in older gensim releases — verify before upgrading.
model.delete_temporary_training_data(keep_inference=True)
model.save('reviews.d2v')
# in other program, we could write: model = Doc2Vec.load('reviews.d2v')
# Parallel lists built from the labelled files: raw sentence text, its inferred
# Doc2Vec vector, and its integer sentiment label.
sentences = []
sentvecs = []
sentiments = []
for fname in ["yelp", "amazon_cells", "imdb"]:
    with open("sentiment labelled sentences/%s_labelled.txt" % fname, encoding='UTF-8') as f:
        for line in f:
            # Each record is "<sentence><TAB><label>"; split on the tab character
            # (the previous code split on the letter 't', which broke the label field).
            line_split = line.strip().split('\t')
            sentences.append(line_split[0])
            words = extract_words(line_split[0])
            # NOTE(review): `steps` is the older gensim keyword for the number of
            # inference passes (later versions call it `epochs`) — confirm for the
            # installed version.
            sentvecs.append(model.infer_vector(words, steps=10))  # create a vector for this document
            sentiments.append(int(line_split[1]))

# shuffle sentences, sentvecs, sentiments together
# Zipping first keeps each sentence aligned with its vector and label; after the
# unzip the three names are bound to tuples rather than lists.
combined = list(zip(sentences, sentvecs, sentiments))
random.shuffle(combined)
sentences, sentvecs, sentiments = zip(*combined)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
clf = KNeighborsClassifier(n_neighbors=9)
clfrf = RandomForestClassifier()
# Evaluate both classifiers on the inferred document vectors with 5-fold
# cross-validation, printing (mean, standard deviation) of the returned scores.
for estimator in (clf, clfrf):
    scores = cross_val_score(estimator, sentvecs, sentiments, cv=5)
    fold_summary = (np.mean(scores), np.std(scores))
    print(fold_summary)
# bag-of-words comparison
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# Baseline on the raw sentence strings: token counts -> TF-IDF -> random forest,
# scored with the same 5-fold protocol.
pipeline = make_pipeline(
    CountVectorizer(),
    TfidfTransformer(),
    RandomForestClassifier(),
)
scores = cross_val_score(pipeline, sentences, sentiments, cv=5)
fold_summary = (np.mean(scores), np.std(scores))
print(fold_summary)
a

More Related Content

PDF
Clojure functions examples
ODT
linieaire regressie
PPT
Aggregate functions
PPT
Mocking Dependencies in PHPUnit
KEY
テストデータどうしてますか?
PDF
Symfony CoP: Form component
TXT
Data20161007
PDF
Doctrine fixtures
Clojure functions examples
linieaire regressie
Aggregate functions
Mocking Dependencies in PHPUnit
テストデータどうしてますか?
Symfony CoP: Form component
Data20161007
Doctrine fixtures

What's hot (20)

PDF
How I started to love design patterns
PDF
PHPUnit でよりよくテストを書くために
PDF
Symfony2 - extending the console component
PPTX
Print input-presentation
PDF
Code moi une RH! (PHP tour 2017)
PPTX
Elf文件解析
PDF
international PHP2011_Bastian Feder_jQuery's Secrets
PDF
Designing Immutability Data Flows in Ember
PDF
CQRS and Event Sourcing in a Symfony application
KEY
Data::FormValidator Simplified
PDF
PHP for Python Developers
PDF
Informatics Practices/ Information Practices Project (IP Project Class 12)
PPTX
Groovy puzzlers jug-moscow-part 2
PDF
3分くらいで分かるassert()
PDF
Difference between mysql_fetch_array and mysql_fetch_assoc in PHP
PDF
How I started to love design patterns
PDF
Cyclejs introduction
PDF
Fog City Ruby - Triple Equals Black Magic
DOCX
Opp compile
PDF
Mocking Demystified
How I started to love design patterns
PHPUnit でよりよくテストを書くために
Symfony2 - extending the console component
Print input-presentation
Code moi une RH! (PHP tour 2017)
Elf文件解析
international PHP2011_Bastian Feder_jQuery's Secrets
Designing Immutability Data Flows in Ember
CQRS and Event Sourcing in a Symfony application
Data::FormValidator Simplified
PHP for Python Developers
Informatics Practices/ Information Practices Project (IP Project Class 12)
Groovy puzzlers jug-moscow-part 2
3分くらいで分かるassert()
Difference between mysql_fetch_array and mysql_fetch_assoc in PHP
How I started to love design patterns
Cyclejs introduction
Fog City Ruby - Triple Equals Black Magic
Opp compile
Mocking Demystified
Ad

Similar to Detect Negative and Positive sentiment in user reviews using python word2vec code (20)

PDF
Django (Web Konferencia 2009)
PDF
Python magicmethods
ODP
Pruebas unitarias con django
PDF
Тестирование и Django
DOCX
AIMLProgram-6 AIMLProgram-6 AIMLProgram-6 AIMLProgram-6
PDF
Unit test
PDF
Python Unit Test
PDF
Separation of concerns - DPC12
PPTX
Django - sql alchemy - jquery
KEY
CoffeeScript - A Rubyist's Love Affair
PDF
Python Ireland Nov 2010 Talk: Unit Testing
KEY
Testing My Patience
PDF
Machine Learning Algorithms
PDF
Python programming : Inheritance and polymorphism
PDF
Object Orientation vs Functional Programming in Python
PPTX
JavaScript Advanced - Useful methods to power up your code
PDF
Python Cheat Sheet for Data Analysis.pdf
PDF
python sheat sheet for Data analysis.pdf
PDF
Python Cheat Sheet for Data Analysis.pdf
PDF
python codes
Django (Web Konferencia 2009)
Python magicmethods
Pruebas unitarias con django
Тестирование и Django
AIMLProgram-6 AIMLProgram-6 AIMLProgram-6 AIMLProgram-6
Unit test
Python Unit Test
Separation of concerns - DPC12
Django - sql alchemy - jquery
CoffeeScript - A Rubyist's Love Affair
Python Ireland Nov 2010 Talk: Unit Testing
Testing My Patience
Machine Learning Algorithms
Python programming : Inheritance and polymorphism
Object Orientation vs Functional Programming in Python
JavaScript Advanced - Useful methods to power up your code
Python Cheat Sheet for Data Analysis.pdf
python sheat sheet for Data analysis.pdf
Python Cheat Sheet for Data Analysis.pdf
python codes
Ad

More from Mamoon Ismail Khalid (20)

PDF
Caring.ai - AI + Voice Agent co-pilot for all things dementia
PPTX
REMOTE SOLAR MONITORING SYSTEM - A solution to make battery life extend by 300%
PPTX
Network Traffic Adaptable Image Codec - A solution to make streaming faster
PDF
Hospital Management and Inventory Control Solution for Public Hospitals in De...
PDF
ATLAS - Product Requirement Document.pdf
PDF
T(X) Innoway - Prediction Algorithm design.pdf
PDF
Joint3DShapeMatching - a fast approach to 3D model matching using MatchALS 3...
PDF
Golf Swing Analysis and Posture Correction System
PDF
24 ideas to revive any developing country.pdf
PDF
#2 - Smart Bins - Returnable Plastic Ecosystem.pdf
PDF
PyTorch to detect Humans Eating Food.pdf
PDF
Future of agriculture agriculture - technology is a necessity in 2020 and beyond
PDF
PDF
Real estate in blockchain (2)
PDF
Cohort analysis saa s (1)
PDF
ISA backed technology skills platform
PDF
Start up valuation methods
PDF
Analysis mvp factory
PDF
Detect spam comments youtube videos and app store reviews
PPTX
Start Up deal/interaction management workflow
Caring.ai - AI + Voice Agent co-pilot for all things dementia
REMOTE SOLAR MONITORING SYSTEM - A solution to make battery life extend by 300%
Network Traffic Adaptable Image Codec - A solution to make streaming faster
Hospital Management and Inventory Control Solution for Public Hospitals in De...
ATLAS - Product Requirement Document.pdf
T(X) Innoway - Prediction Algorithm design.pdf
Joint3DShapeMatching - a fast approach to 3D model matching using MatchALS 3...
Golf Swing Analysis and Posture Correction System
24 ideas to revive any developing country.pdf
#2 - Smart Bins - Returnable Plastic Ecosystem.pdf
PyTorch to detect Humans Eating Food.pdf
Future of agriculture agriculture - technology is a necessity in 2020 and beyond
Real estate in blockchain (2)
Cohort analysis saa s (1)
ISA backed technology skills platform
Start up valuation methods
Analysis mvp factory
Detect spam comments youtube videos and app store reviews
Start Up deal/interaction management workflow

Recently uploaded (20)

PDF
Structs to JSON How Go Powers REST APIs.pdf
PPTX
Recipes for Real Time Voice AI WebRTC, SLMs and Open Source Software.pptx
PPTX
KTU 2019 -S7-MCN 401 MODULE 2-VINAY.pptx
PPT
Mechanical Engineering MATERIALS Selection
PPTX
Lecture Notes Electrical Wiring System Components
PPTX
M Tech Sem 1 Civil Engineering Environmental Sciences.pptx
PPTX
MET 305 2019 SCHEME MODULE 2 COMPLETE.pptx
PPTX
MCN 401 KTU-2019-PPE KITS-MODULE 2.pptx
PPTX
Sustainable Sites - Green Building Construction
PPTX
Strings in CPP - Strings in C++ are sequences of characters used to store and...
PDF
Digital Logic Computer Design lecture notes
PDF
keyrequirementskkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkk
PDF
BMEC211 - INTRODUCTION TO MECHATRONICS-1.pdf
PPTX
Infosys Presentation by1.Riyan Bagwan 2.Samadhan Naiknavare 3.Gaurav Shinde 4...
DOCX
573137875-Attendance-Management-System-original
PDF
Operating System & Kernel Study Guide-1 - converted.pdf
PDF
Well-logging-methods_new................
PPTX
FINAL REVIEW FOR COPD DIANOSIS FOR PULMONARY DISEASE.pptx
PDF
July 2025 - Top 10 Read Articles in International Journal of Software Enginee...
PPTX
OOP with Java - Java Introduction (Basics)
Structs to JSON How Go Powers REST APIs.pdf
Recipes for Real Time Voice AI WebRTC, SLMs and Open Source Software.pptx
KTU 2019 -S7-MCN 401 MODULE 2-VINAY.pptx
Mechanical Engineering MATERIALS Selection
Lecture Notes Electrical Wiring System Components
M Tech Sem 1 Civil Engineering Environmental Sciences.pptx
MET 305 2019 SCHEME MODULE 2 COMPLETE.pptx
MCN 401 KTU-2019-PPE KITS-MODULE 2.pptx
Sustainable Sites - Green Building Construction
Strings in CPP - Strings in C++ are sequences of characters used to store and...
Digital Logic Computer Design lecture notes
keyrequirementskkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkk
BMEC211 - INTRODUCTION TO MECHATRONICS-1.pdf
Infosys Presentation by1.Riyan Bagwan 2.Samadhan Naiknavare 3.Gaurav Shinde 4...
573137875-Attendance-Management-System-original
Operating System & Kernel Study Guide-1 - converted.pdf
Well-logging-methods_new................
FINAL REVIEW FOR COPD DIANOSIS FOR PULMONARY DISEASE.pptx
July 2025 - Top 10 Read Articles in International Journal of Software Enginee...
OOP with Java - Java Introduction (Basics)

Detect Negative and Positive sentiment in user reviews using python word2vec code

  • 1. import gensim, logging logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) gmodel = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True) from gensim.models.doc2vec import TaggedDocument from gensim.models import Doc2Vec def extract_words(sent): sent = sent.lower() sent = re.sub(r'<[^>]+>', ' ', sent) # strip html tags sent = re.sub(r'(w)'(w)', '12', sent) # remove apostrophes sent = re.sub(r'W', ' ', sent) # remove punctuation sent = re.sub(r's+', ' ', sent) # remove repeated spaces sent = sent.strip() return sent.split() # unsupervised training data import re import os unsup_sentences = [] # source: http://ai.stanford.edu/~amaas/data/sentiment/, data from IMDB for dirname in ["train/pos", "train/neg", "train/unsup", "test/pos", "test/neg"]: for fname in sorted(os.listdir("aclImdb/" + dirname)): if fname[-4:] == '.txt': with open("aclImdb/" + dirname + "/" + fname, encoding='UTF-8') as f: sent = f.read() words = extract_words(sent) unsup_sentences.append(TaggedDocument(words, [dirname + "/" + fname])) # source: http://www.cs.cornell.edu/people/pabo/movie-review-data/ for dirname in ["review_polarity/txt_sentoken/pos", "review_polarity/txt_sentoken/neg"]: for fname in sorted(os.listdir(dirname)): if fname[-4:] == '.txt':
  • 2. with open(dirname + "/" + fname, encoding='UTF-8') as f: for i, sent in enumerate(f): words = extract_words(sent) unsup_sentences.append(TaggedDocument(words, ["%s/%s-%d" % (dirname, fname, i)])) # source: https://nlp.stanford.edu/sentiment/, data from Rotten Tomatoes with open("stanfordSentimentTreebank/original_rt_snippets.txt", encoding='UTF-8') as f: for i, line in enumerate(f): words = extract_words(sent) unsup_sentences.append(TaggedDocument(words, ["rt-%d" % i])) import random class PermuteSentences(object): def __init__(self, sents): self.sents = sents def __iter__(self): shuffled = list(self.sents) random.shuffle(shuffled) for sent in shuffled: yield sent permuter = PermuteSentences(unsup_sentences) model = Doc2Vec(permuter, dm=0, hs=1, size=50) # done with training, free up some memory model.delete_temporary_training_data(keep_inference=True) model.save('reviews.d2v') # in other program, we could write: model = Doc2Vec.load('reviews.d2v') sentences = [] sentvecs = [] sentiments = [] for fname in ["yelp", "amazon_cells", "imdb"]: with open("sentiment labelled sentences/%s_labelled.txt" % fname, encoding='UTF-8') as f:
  • 3. for i, line in enumerate(f): line_split = line.strip().split('t') sentences.append(line_split[0]) words = extract_words(line_split[0]) sentvecs.append(model.infer_vector(words, steps=10)) # create a vector for this document sentiments.append(int(line_split[1])) # shuffle sentences, sentvecs, sentiments together combined = list(zip(sentences, sentvecs, sentiments)) random.shuffle(combined) sentences, sentvecs, sentiments = zip(*combined) from sklearn.neighbors import KNeighborsClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import cross_val_score import numpy as np clf = KNeighborsClassifier(n_neighbors=9) clfrf = RandomForestClassifier() scores = cross_val_score(clf, sentvecs, sentiments, cv=5) print((np.mean(scores), np.std(scores))) scores = cross_val_score(clfrf, sentvecs, sentiments, cv=5) print((np.mean(scores), np.std(scores))) # bag-of-words comparison from sklearn.pipeline import make_pipeline from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer pipeline = make_pipeline(CountVectorizer(), TfidfTransformer(), RandomForestClassifier()) scores = cross_val_score(pipeline, sentences, sentiments, cv=5) print((np.mean(scores), np.std(scores))) a