Skip to content

Convert script to OOP format #19

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 14, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -155,3 +155,5 @@ crashlytics.properties
crashlytics-build.properties
fabric.properties

# .idea
.idea/
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@
- sudo rm -f /etc/boto.cfg
- pip install -r requirements.txt
script:
- python3 model.py
- python3 text_classifier.py
Empty file added classifiers/__init__.py
Empty file.
Empty file added data/__init__.py
Empty file.
File renamed without changes.
107 changes: 0 additions & 107 deletions model.py

This file was deleted.

Empty file added models/__init__.py
Empty file.
67 changes: 67 additions & 0 deletions models/classifier_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from .model import Model
from .doc2vec_model import doc2VecModel

import logging
import os
import inspect

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# Log line layout: "<timestamp> : <level> : <message>", emitted from INFO up.
_LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)

# Locate this module on disk through the current frame, then walk upwards:
# base_path is the directory holding this file, project_dir_path is that
# directory's parent, and classifiers_path is 'classifiers' under the parent.
base_file_path = inspect.getframeinfo(inspect.currentframe()).filename
base_path = os.path.split(os.path.abspath(base_file_path))[0]
project_dir_path = os.path.split(os.path.abspath(base_path))[0]
classifiers_path = os.path.join(project_dir_path, 'classifiers')


class classifierModel(Model):
    """Logistic-regression classifier fed with Doc2Vec document vectors.

    The wrapped estimator is stored in ``self.model``. It is created by
    :meth:`initialize_model` and must exist before training, testing or
    predicting.
    """

    # Size of the embedding vectors requested from the Doc2Vec wrapper.
    VECTOR_SIZE = 300

    def __init__(self):
        super().__init__()

    def initialize_model(self):
        """Create a new, unfitted ``LogisticRegression`` estimator."""
        self.model = LogisticRegression()

    def _fetch_vectors(self, d2v, documents, vectors_type):
        """Return one Doc2Vec vector per entry of ``documents``.

        Only ``len(documents)`` is used here; the vectors are read from
        ``d2v`` through ``doc2VecModel.get_vectors``.
        """
        return doc2VecModel.get_vectors(
            d2v, len(documents), self.VECTOR_SIZE, vectors_type)

    @staticmethod
    def _log_scores(stage, labels, predictions):
        """Log predicted classes, accuracy and weighted F1 for ``stage``."""
        logging.info(
            '{} predicted classes: {}'.format(stage, np.unique(predictions)))
        logging.info(
            '{} accuracy: {}'.format(
                stage, accuracy_score(labels, predictions)))
        logging.info(
            '{} F1 score: {}'.format(
                stage, f1_score(labels, predictions, average='weighted')))

    def train_model(self, d2v, training_vectors, training_labels):
        """Fit the classifier and log its scores on the training data.

        :param d2v: trained Doc2Vec wrapper holding the 'Train' vectors
        :param training_vectors: training documents (only their count is used)
        :param training_labels: target label for every training document
        """
        logging.info("Classifier training")
        train_vectors = self._fetch_vectors(d2v, training_vectors, 'Train')
        self.model.fit(train_vectors, np.array(training_labels))
        training_predictions = self.model.predict(train_vectors)
        self._log_scores('Training', training_labels, training_predictions)

    def test_model(self, d2v, testing_vectors, testing_labels):
        """Predict on the test documents and log the resulting scores.

        :param d2v: trained Doc2Vec wrapper holding the 'Test' vectors
        :param testing_vectors: test documents (only their count is used)
        :param testing_labels: expected label for every test document
        """
        logging.info("Classifier testing")
        test_vectors = self._fetch_vectors(d2v, testing_vectors, 'Test')
        testing_predictions = self.model.predict(test_vectors)
        self._log_scores('Testing', testing_labels, testing_predictions)

    def predict(self, d2v, testing_vectors):
        """Predict labels for the 'Test' documents, log and return them.

        :param d2v: trained Doc2Vec wrapper holding the 'Test' vectors
        :param testing_vectors: documents to classify (only their count is
            used)
        :return: the predictions produced by the fitted estimator
        """
        logging.info("Classifier Predicting")
        test_vectors = self._fetch_vectors(d2v, testing_vectors, 'Test')
        testing_predictions = self.model.predict(test_vectors)
        logging.info(testing_predictions)
        # Hand the result back to the caller instead of only logging it.
        return testing_predictions
91 changes: 91 additions & 0 deletions models/doc2vec_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
from .model import Model

import logging
import random
import os
import inspect

import numpy as np
from gensim.models import doc2vec


# Log line layout: "<timestamp> : <level> : <message>", emitted from INFO up.
_LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)

# Resolve this module's file through the current frame, then derive the
# directory that contains it, that directory's parent, and the
# 'classifiers' folder located under the parent.
base_file_path = inspect.getframeinfo(inspect.currentframe()).filename
base_path = os.path.split(os.path.abspath(base_file_path))[0]
project_dir_path = os.path.split(os.path.abspath(base_path))[0]
classifiers_path = os.path.join(project_dir_path, 'classifiers')


class doc2VecModel(Model):
    """Wrapper around gensim's ``Doc2Vec`` model.

    ``self.model`` holds the gensim model and ``self.corpus`` the tagged
    documents it is trained on; both are set by :meth:`initialize_model`.
    """

    def __init__(self):
        super().__init__()

    def initialize_model(self, corpus):
        """Create the Doc2Vec model and build its vocabulary.

        :param corpus: iterable of tagged documents, e.g. the output of
            :meth:`label_sentences`
        """
        logging.info("Building Doc2Vec vocabulary")
        # Keep a private copy: train_model() shuffles the corpus in place
        # and must not reorder the list owned by the caller.
        self.corpus = list(corpus)
        self.model = doc2vec.Doc2Vec(
            # Ignores all words with total frequency lower than this.
            min_count=1,
            # The maximum distance between the current and predicted word
            # within a sentence.
            window=10,
            # Dimensionality of the generated feature vectors.
            vector_size=300,
            # Number of worker threads used to train the model.
            workers=5,
            # The initial learning rate.
            alpha=0.025,
            # Learning rate will linearly drop to min_alpha as training
            # progresses.
            min_alpha=0.00025,
            # Training algorithm: dm=1 is 'distributed memory' (PV-DM),
            # dm=0 is 'distributed bag of words' (PV-DBOW).
            dm=1,
        )
        self.model.build_vocab(self.corpus)

    def train_model(self):
        """Train the Doc2Vec model for 10 shuffled passes over the corpus.

        After every pass the corpus is shuffled, the learning rate is
        lowered by 0.0002 and ``min_alpha`` is pinned to the new ``alpha``.
        """
        logging.info("Training Doc2Vec model")
        # 10 epochs take around 10 minutes on my machine (i7),
        # if you have more time/computational power make it 20
        # NOTE(review): gensim's documentation advises a single train() call
        # over a manual loop with hand-managed alpha -- confirm before
        # changing, since that would alter the resulting vectors.
        for epoch in range(10):
            logging.info('Training iteration #{0}'.format(epoch))
            self.model.train(
                self.corpus, total_examples=self.model.corpus_count,
                epochs=self.model.epochs)
            # shuffle the corpus
            random.shuffle(self.corpus)
            # decrease the learning rate
            self.model.alpha -= 0.0002
            # fix the learning rate, no decay
            self.model.min_alpha = self.model.alpha

    def get_vectors(self, corpus_size, vectors_size, vectors_type):
        """
        Get vectors from the trained doc2vec model held in ``self.model``.

        :param corpus_size: Number of documents to fetch vectors for
        :param vectors_size: Size of the embedding vectors
        :param vectors_type: Tag prefix of the documents ('Train' or 'Test')
        :return: array of shape (corpus_size, vectors_size)
        """
        vectors = np.zeros((corpus_size, vectors_size))
        for i in range(corpus_size):
            # Tags follow the '<type>_<index>' scheme of label_sentences().
            prefix = vectors_type + '_' + str(i)
            vectors[i] = self.model.docvecs[prefix]
        return vectors

    @staticmethod
    def label_sentences(corpus, label_type):
        """
        Gensim's Doc2Vec implementation requires each
        document/paragraph to have a tag associated with it.
        We do this by wrapping every document in a ``TaggedDocument``.
        The tag is "<label_type>_i" (e.g. "Train_i" or "Test_i") where
        "i" is the position of the document in ``corpus``.

        :param corpus: iterable of raw text documents
        :param label_type: prefix used for the generated tags
        :return: list of tagged documents, words split on whitespace
        """
        # TaggedDocument replaces the deprecated LabeledSentence alias.
        return [
            doc2vec.TaggedDocument(text.split(), [label_type + '_' + str(i)])
            for i, text in enumerate(corpus)
        ]
16 changes: 16 additions & 0 deletions models/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from abc import ABC, abstractmethod


class Model(ABC):
    """Abstract base class for the model wrappers.

    Subclasses keep the wrapped model object in ``self.model`` and must
    implement :meth:`initialize_model` and :meth:`train_model`. The abstract
    signatures accept arbitrary arguments because concrete models need
    different inputs (a corpus, vectors and labels, ...).
    """

    def __init__(self):
        # None means no underlying model has been created yet.
        self.model = None
        super().__init__()

    @abstractmethod
    def initialize_model(self, *args, **kwargs):
        """Create the underlying model; expected to set ``self.model``."""

    @abstractmethod
    def train_model(self, *args, **kwargs):
        """Train the underlying model held in ``self.model``."""
74 changes: 74 additions & 0 deletions text_classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from models.doc2vec_model import doc2VecModel
from models.classifier_model import classifierModel

import os
import logging
import inspect

import pandas as pd
from sklearn.model_selection import train_test_split


# Log line layout: "<timestamp> : <level> : <message>", emitted from INFO up.
_LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)

# The directory containing this file is treated as the project root; data
# and classifier artefacts are addressed relative to it.
base_file_path = inspect.getframeinfo(inspect.currentframe()).filename
project_dir_path = os.path.split(os.path.abspath(base_file_path))[0]
data_path = os.path.join(project_dir_path, 'data')

_classifiers_dir = os.path.join(project_dir_path, 'classifiers')
default_classifier = os.path.join(_classifiers_dir, 'logreg_model.pkl')
default_doc2vec = os.path.join(_classifiers_dir, 'd2v.model')
default_dataset = os.path.join(data_path, 'dataset.csv')


class TextClassifier():
    """Pipeline tying a Doc2Vec model to a classifier.

    ``self.d2v`` produces document vectors, ``self.classifier`` is trained
    on them, and ``self.dataset`` holds the data loaded by :meth:`read_data`
    (``None`` until then).
    """

    def __init__(self):
        super().__init__()
        self.d2v = doc2VecModel()
        self.classifier = classifierModel()
        self.dataset = None

    def read_data(self, filename):
        """Load a tab-delimited file with a header row into ``self.dataset``.

        :param filename: file name, resolved relative to ``data_path``
        """
        filename = os.path.join(data_path, filename)
        self.dataset = pd.read_csv(filename, header=0, delimiter="\t")

    def prepare_all_data(self):
        """Split the dataset 90/10 and tag the documents for Doc2Vec.

        Reads the ``review`` and ``sentiment`` columns of ``self.dataset``.

        :return: ``(x_train, x_test, y_train, y_test, all_data)`` where the
            ``x_*`` values are tagged documents and ``all_data`` is the
            tagged training documents followed by the tagged test documents
        """
        x_train, x_test, y_train, y_test = train_test_split(
            self.dataset.review, self.dataset.sentiment, random_state=0,
            test_size=0.1)
        x_train = doc2VecModel.label_sentences(x_train, 'Train')
        x_test = doc2VecModel.label_sentences(x_test, 'Test')
        all_data = x_train + x_test
        return x_train, x_test, y_train, y_test, all_data

    def prepare_test_data(self, sentence):
        """Tag one sentence, or an iterable of sentences, as 'Test' data.

        :param sentence: a single text string or an iterable of strings
        :return: list of tagged documents
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(sentence, str):
            sentence = [sentence]
        x_test = doc2VecModel.label_sentences(sentence, 'Test')
        return x_test

    def train_classifier(self):
        """Train Doc2Vec and the classifier, then evaluate on the test split.

        :return: ``(self.d2v, self.classifier)``
        """
        x_train, x_test, y_train, y_test, all_data = self.prepare_all_data()
        self.d2v.initialize_model(all_data)
        self.d2v.train_model()
        self.classifier.initialize_model()
        self.classifier.train_model(self.d2v, x_train, y_train)
        self.classifier.test_model(self.d2v, x_test, y_test)
        return self.d2v, self.classifier

    def test_classifier(self):
        """Evaluate the classifier on the test split, if models exist.

        Logs a message and returns without touching the dataset when either
        underlying model is still ``None``.
        """
        # Check first so no split/tagging work is done when there is
        # nothing to evaluate.
        if self.d2v.model is None or self.classifier.model is None:
            logging.info(
                "Models Not Found, Train First or Use Correct Model Names")
            return
        _, x_test, _, y_test, _ = self.prepare_all_data()
        self.classifier.test_model(self.d2v, x_test, y_test)


def run(dataset_file):
    """Load ``dataset_file``, attempt an evaluation, then train.

    :param dataset_file: dataset file name handed to
        ``TextClassifier.read_data``
    """
    text_classifier = TextClassifier()
    text_classifier.read_data(dataset_file)
    # Evaluation is requested before training, exactly in this order.
    text_classifier.test_classifier()
    text_classifier.train_classifier()


# Script entry point: process the default dataset file.
if __name__ == "__main__":
    run("dataset.csv")