ibrahimsharaf · ibrahimsharaf · Jun 14, 2019 · Apr 27, 2019 · Jun 14, 2019 · Jun 14, 2019
diff --git a/.gitignore b/.gitignore
@@ -155,3 +155,5 @@ crashlytics.properties
 crashlytics-build.properties
 fabric.properties
 
+# JetBrains IDE project settings
+.idea/
diff --git a/.travis.yml b/.travis.yml
@@ -5,4 +5,4 @@
   - sudo rm -f /etc/boto.cfg
   - pip install -r requirements.txt
  script:
-    - python3 model.py
+   - python3 text_classifier.py
diff --git a/classifiers/__init__.py b/classifiers/__init__.py
diff --git a/data/__init__.py b/data/__init__.py
diff --git a/dataset.csv → data/dataset.csv b/dataset.csv → data/dataset.csv
diff --git a/model.py b/model.py
diff --git a/models/__init__.py b/models/__init__.py
diff --git a/models/classifier_model.py b/models/classifier_model.py
@@ -0,0 +1,67 @@
+from .model import Model
+from .doc2vec_model import doc2VecModel
+
+import logging
+import os
+import inspect
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score, f1_score
+
+# Configure logging at import time: timestamp, level and message, INFO level.
+logging.basicConfig(
+    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+# File name of this module, read from the currently executing frame.
+base_file_path = inspect.getframeinfo(inspect.currentframe()).filename
+# Directory that contains this module.
+base_path = os.path.dirname(os.path.abspath(base_file_path))
+# Parent of the module directory.
+project_dir_path = os.path.dirname(os.path.abspath(base_path))
+# '<parent dir>/classifiers'; not referenced by the code visible in this
+# module.
+classifiers_path = os.path.join(project_dir_path, 'classifiers')
+
+
+class classifierModel(Model):
+    """Logistic-regression classifier fed with Doc2Vec document vectors.
+
+    The document vectors are never passed in directly: every method looks
+    them up in a trained ``doc2VecModel`` by tag ('Train_<i>' / 'Test_<i>').
+    """
+
+    # Width of the document vectors requested from the Doc2Vec model.
+    # Must match the ``vector_size`` the Doc2Vec model was built with.
+    vector_size = 300
+
+    def __init__(self):
+        super().__init__()
+
+    def initialize_model(self):
+        """Create a fresh, unfitted ``LogisticRegression`` estimator."""
+        self.model = LogisticRegression()
+
+    def _fetch_vectors(self, d2v, documents, vectors_type):
+        """Return the Doc2Vec vectors tagged ``<vectors_type>_<i>``.
+
+        Only ``len(documents)`` is used; the vectors themselves come from
+        ``d2v``.
+        """
+        return d2v.get_vectors(
+            len(documents), self.vector_size, vectors_type)
+
+    @staticmethod
+    def _log_scores(stage, labels, predictions):
+        """Log predicted classes, accuracy and weighted F1 for ``stage``."""
+        logging.info(
+            '{} predicted classes: {}'.format(stage, np.unique(predictions)))
+        logging.info(
+            '{} accuracy: {}'.format(
+                stage, accuracy_score(labels, predictions)))
+        logging.info(
+            '{} F1 score: {}'.format(
+                stage, f1_score(labels, predictions, average='weighted')))
+
+    def train_model(self, d2v, training_vectors, training_labels):
+        """Fit the classifier and log its scores on the training set.
+
+        :param d2v: trained doc2VecModel holding the 'Train_<i>' vectors
+        :param training_vectors: training documents; only the length is used
+        :param training_labels: target labels, one per training document
+        """
+        logging.info("Classifier training")
+        train_vectors = self._fetch_vectors(d2v, training_vectors, 'Train')
+        self.model.fit(train_vectors, np.array(training_labels))
+        training_predictions = self.model.predict(train_vectors)
+        self._log_scores('Training', training_labels, training_predictions)
+
+    def test_model(self, d2v, testing_vectors, testing_labels):
+        """Predict on the held-out set and log the resulting scores.
+
+        :param d2v: trained doc2VecModel holding the 'Test_<i>' vectors
+        :param testing_vectors: test documents; only the length is used
+        :param testing_labels: expected labels, one per test document
+        """
+        logging.info("Classifier testing")
+        test_vectors = self._fetch_vectors(d2v, testing_vectors, 'Test')
+        testing_predictions = self.model.predict(test_vectors)
+        self._log_scores('Testing', testing_labels, testing_predictions)
+
+    def predict(self, d2v, testing_vectors):
+        """Predict labels for the 'Test_<i>' vectors stored in ``d2v``.
+
+        :param d2v: trained doc2VecModel holding the 'Test_<i>' vectors
+        :param testing_vectors: documents to classify; only the length is used
+        :return: array of predicted labels (also written to the log)
+        """
+        logging.info("Classifier Predicting")
+        test_vectors = self._fetch_vectors(d2v, testing_vectors, 'Test')
+        testing_predictions = self.model.predict(test_vectors)
+        logging.info(testing_predictions)
+        # Previously the predictions were only logged and then discarded.
+        return testing_predictions
diff --git a/models/doc2vec_model.py b/models/doc2vec_model.py
@@ -0,0 +1,91 @@
+from .model import Model
+
+import logging
+import random
+import os
+import inspect
+
+import numpy as np
+from gensim.models import doc2vec
+
+
+# Configure logging at import time: timestamp, level and message, INFO level.
+logging.basicConfig(
+    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+# File name of this module, read from the currently executing frame.
+base_file_path = inspect.getframeinfo(inspect.currentframe()).filename
+# Directory that contains this module.
+base_path = os.path.dirname(os.path.abspath(base_file_path))
+# Parent of the module directory.
+project_dir_path = os.path.dirname(os.path.abspath(base_path))
+# '<parent dir>/classifiers'; not referenced by the code visible in this
+# module.
+classifiers_path = os.path.join(project_dir_path, 'classifiers')
+
+
+class doc2VecModel(Model):
+    """Wrapper around gensim's ``Doc2Vec`` for building document vectors."""
+
+    def __init__(self):
+        super().__init__()
+
+    def initialize_model(self, corpus):
+        """Create the Doc2Vec model and build its vocabulary.
+
+        :param corpus: list of tagged documents (see ``label_sentences``);
+            it is kept on the instance and shuffled in place by
+            ``train_model``
+        """
+        logging.info("Building Doc2Vec vocabulary")
+        self.corpus = corpus
+        self.model = doc2vec.Doc2Vec(
+            # Ignore all words with total frequency lower than this.
+            min_count=1,
+            # Maximum distance between the current and predicted word
+            # within a sentence.
+            window=10,
+            # Dimensionality of the generated feature vectors.
+            vector_size=300,
+            # Number of worker threads used to train the model.
+            workers=5,
+            # Initial learning rate.
+            alpha=0.025,
+            # Learning rate drops linearly to min_alpha as training
+            # progresses.
+            min_alpha=0.00025,
+            # Training algorithm: 1 is 'distributed memory' (PV-DM),
+            # 0 is 'distributed bag of words' (PV-DBOW).
+            dm=1)
+        self.model.build_vocab(self.corpus)
+
+    def train_model(self):
+        """Train the Doc2Vec model over ten shuffled passes of the corpus.
+
+        After every pass the corpus is shuffled in place, ``alpha`` is
+        lowered by 0.0002 and ``min_alpha`` is pinned to the new ``alpha``.
+        """
+        logging.info("Training Doc2Vec model")
+        # 10 epochs take around 10 minutes on my machine (i7),
+        #  if you have more time/computational power make it 20
+        # NOTE(review): every pass calls train() with
+        # epochs=self.model.epochs, so the total is 10 x model.epochs.
+        # gensim's documentation recommends a single train() call instead
+        # of manual alpha management; confirm this loop is intentional.
+        for epoch in range(10):
+            logging.info('Training iteration #{0}'.format(epoch))
+            self.model.train(
+                self.corpus, total_examples=self.model.corpus_count,
+                epochs=self.model.epochs)
+            # shuffle the corpus
+            random.shuffle(self.corpus)
+            # decrease the learning rate
+            self.model.alpha -= 0.0002
+            # fix the learning rate, no decay
+            self.model.min_alpha = self.model.alpha
+
+    def get_vectors(self, corpus_size, vectors_size, vectors_type):
+        """
+        Get vectors from the trained doc2vec model
+        :param corpus_size: Number of documents to fetch
+        :param vectors_size: Size of the embedding vectors
+        :param vectors_type: Tag prefix of the documents ('Train' or 'Test')
+        :return: numpy array of shape (corpus_size, vectors_size)
+        """
+        vectors = np.zeros((corpus_size, vectors_size))
+        for i in range(0, corpus_size):
+            # Tags follow the '<prefix>_<index>' scheme of label_sentences.
+            prefix = vectors_type + '_' + str(i)
+            vectors[i] = self.model.docvecs[prefix]
+        return vectors
+
+    @staticmethod
+    def label_sentences(corpus, label_type):
+        """
+        Gensim's Doc2Vec implementation requires each
+         document/paragraph to have a tag associated with it.
+        We do this by wrapping every document in a TaggedDocument.
+        The tag format is "<label_type>_<i>" (e.g. "Train_0", "Test_3"),
+        where "i" is the position of the document in ``corpus``.
+        :param corpus: iterable of raw text documents
+        :param label_type: tag prefix, e.g. 'Train' or 'Test'
+        :return: list of TaggedDocument, one per input document
+        """
+        # TaggedDocument replaces the deprecated LabeledSentence alias.
+        return [
+            doc2vec.TaggedDocument(text.split(), [label_type + '_' + str(i)])
+            for i, text in enumerate(corpus)]
diff --git a/models/model.py b/models/model.py
@@ -0,0 +1,16 @@
+from abc import ABC, abstractmethod
+
+
+class Model(ABC):
+    """Abstract base for the model wrappers in this package.
+
+    Concrete subclasses must implement ``initialize_model`` and
+    ``train_model``; the base class cannot be instantiated directly.
+    """
+
+    def __init__(self):
+        # Wrapped estimator; None until a subclass assigns it.
+        self.model = None
+        super().__init__()
+
+    @abstractmethod
+    def initialize_model(self, *args, **kwargs):
+        """Build the underlying model and store it in ``self.model``.
+
+        Subclasses take different inputs (e.g. a corpus), so the abstract
+        signature accepts arbitrary arguments.
+        """
+
+    @abstractmethod
+    def train_model(self, *args, **kwargs):
+        """Train ``self.model``.
+
+        Subclasses take different inputs (e.g. vectors and labels), so the
+        abstract signature accepts arbitrary arguments.
+        """
diff --git a/text_classifier.py b/text_classifier.py
@@ -0,0 +1,74 @@
+from models.doc2vec_model import doc2VecModel
+from models.classifier_model import classifierModel
+
+import os
+import logging
+import inspect
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+
+# Configure logging at import time: timestamp, level and message, INFO level.
+logging.basicConfig(
+    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+# File name of this script, read from the currently executing frame.
+base_file_path = inspect.getframeinfo(inspect.currentframe()).filename
+# Directory that contains this script.
+project_dir_path = os.path.dirname(os.path.abspath(base_file_path))
+# Dataset file names given to TextClassifier.read_data are joined onto this.
+data_path = os.path.join(project_dir_path, 'data')
+# Default artefact and dataset locations; not referenced by the code visible
+# in this module.
+default_classifier = os.path.join(
+    project_dir_path, 'classifiers', 'logreg_model.pkl')
+default_doc2vec = os.path.join(project_dir_path, 'classifiers', 'd2v.model')
+default_dataset = os.path.join(data_path, 'dataset.csv')
+
+
+class TextClassifier():
+    """Ties a Doc2Vec model and a classifier into one train/test pipeline."""
+
+    def __init__(self):
+        super().__init__()
+        self.d2v = doc2VecModel()
+        self.classifier = classifierModel()
+        # Populated by read_data().
+        self.dataset = None
+
+    def read_data(self, filename):
+        """Load a tab-separated dataset with a header row.
+
+        :param filename: file name, joined onto the module's ``data_path``
+        """
+        filename = os.path.join(data_path, filename)
+        self.dataset = pd.read_csv(filename, header=0, delimiter="\t")
+
+    def prepare_all_data(self):
+        """Split the dataset 90/10 and tag both parts for Doc2Vec.
+
+        Reads the ``review`` and ``sentiment`` columns of the loaded
+        dataset. The split uses ``random_state=0``, so it is repeatable.
+
+        :return: tuple ``(x_train, x_test, y_train, y_test, all_data)``
+            where the ``x_*`` items are lists of tagged documents and
+            ``all_data`` is ``x_train + x_test``
+        """
+        x_train, x_test, y_train, y_test = train_test_split(
+            self.dataset.review, self.dataset.sentiment, random_state=0,
+            test_size=0.1)
+        x_train = doc2VecModel.label_sentences(x_train, 'Train')
+        x_test = doc2VecModel.label_sentences(x_test, 'Test')
+        all_data = x_train + x_test
+        return x_train, x_test, y_train, y_test, all_data
+
+    def prepare_test_data(self, sentence):
+        """Tag one sentence, or an iterable of sentences, as 'Test' data.
+
+        :param sentence: a single string or an iterable of strings
+        :return: list of tagged documents
+        """
+        # A bare string would otherwise be iterated character by character,
+        # producing one document per character.
+        if isinstance(sentence, str):
+            sentence = [sentence]
+        return doc2VecModel.label_sentences(sentence, 'Test')
+
+    def train_classifier(self):
+        """Train Doc2Vec on all documents, then fit and score the classifier.
+
+        :return: tuple ``(d2v, classifier)`` of the trained wrappers
+        """
+        x_train, x_test, y_train, y_test, all_data = self.prepare_all_data()
+        self.d2v.initialize_model(all_data)
+        self.d2v.train_model()
+        self.classifier.initialize_model()
+        self.classifier.train_model(self.d2v, x_train, y_train)
+        self.classifier.test_model(self.d2v, x_test, y_test)
+        return self.d2v, self.classifier
+
+    def test_classifier(self):
+        """Score the classifier on the test split, if models are available.
+
+        Logs a message and returns when either model is still missing.
+        """
+        # Check first so the dataset is not split when nothing can be tested.
+        if self.d2v.model is None or self.classifier.model is None:
+            logging.info(
+                "Models Not Found, Train First or Use Correct Model Names")
+            return
+        _, x_test, _, y_test, _ = self.prepare_all_data()
+        self.classifier.test_model(self.d2v, x_test, y_test)
+
+
+def run(dataset_file):
+    """Load ``dataset_file`` and run the test and train steps in order.
+
+    :param dataset_file: dataset file name handed to ``read_data``
+    """
+    text_classifier = TextClassifier()
+    text_classifier.read_data(dataset_file)
+    # The test step runs before training, on a freshly built classifier
+    # whose wrapped models are still None.
+    text_classifier.test_classifier()
+    text_classifier.train_classifier()
+
+
+# Script entry point: run the pipeline on the bundled dataset file name.
+if __name__ == "__main__":
+    run("dataset.csv")
-Original file line number
+Diff line change
@@ Expand Up / @@ -155,3 +155,5 @@ crashlytics.properties @@
     crashlytics-build.properties
     fabric.properties
+    # .idea
+    .idea/