JohnSnowLabs
diff --git a/‎python/sparknlp/annotator.py
Lines changed: 31 additions & 25 deletions b/‎python/sparknlp/annotator.py
Lines changed: 31 additions & 25 deletions
diff --git a/‎src/main/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParser.scala
Lines changed: 6 additions & 0 deletions b/‎src/main/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParser.scala
Lines changed: 6 additions & 0 deletions
diff --git a/‎src/main/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserModel.scala
Lines changed: 7 additions & 2 deletions b/‎src/main/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserModel.scala
Lines changed: 7 additions & 2 deletions
diff --git a/‎src/main/scala/com/johnsnowlabs/nlp/annotators/parser/dep/GreedyTransition/GreedyTransitionApproach.scala
Lines changed: 3 additions & 2 deletions b/‎src/main/scala/com/johnsnowlabs/nlp/annotators/parser/dep/GreedyTransition/GreedyTransitionApproach.scala
Lines changed: 3 additions & 2 deletions
diff --git a/‎src/main/scala/com/johnsnowlabs/nlp/annotators/sda/vivekn/ViveknSentimentApproach.scala
Lines changed: 20 additions & 37 deletions b/‎src/main/scala/com/johnsnowlabs/nlp/annotators/sda/vivekn/ViveknSentimentApproach.scala
Lines changed: 20 additions & 37 deletions
diff --git a/‎src/main/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingApproach.scala
Lines changed: 6 additions & 2 deletions b/‎src/main/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingApproach.scala
Lines changed: 6 additions & 2 deletions
@@ -352,41 +352,46 @@ class ViveknSentimentApproach(JavaEstimator, JavaMLWritable, JavaMLReadable, Ann
 
     pruneCorpus = Param(Params._dummy(),
                         "pruneCorpus",
-                        "whether to prune low frequency words",
-                        typeConverter=TypeConverters.toBoolean)
+                        "Removes unfrequent scenarios from scope. The higher the better performance. Defaults 1",
+                        typeConverter=TypeConverters.toInt)
+
+    tokenPattern = Param(Params._dummy(),
+                         "tokenPattern",  # must equal the Scala Param name, not "negativeSource"
+                         "Regex pattern to use in tokenization of corpus. Defaults \\S+",
+                         typeConverter=TypeConverters.toString)
 
     @keyword_only
     def __init__(self,
                  positiveSource="",
                  negativeSource="",
-                 pruneCorpus=False
+                 pruneCorpus=1,
+                 tokenPattern="\\S+"
                  ):
         super(ViveknSentimentApproach, self).__init__()
         self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.sda.vivekn.ViveknSentimentApproach", self.uid)
         kwargs = self._input_kwargs
         self._setDefault(
             positiveSource="",
             negativeSource="",
-            pruneCorpus=False
+            pruneCorpus=1,
+            tokenPattern="\\S+"
         )
         self.setParams(**kwargs)
 
     def setPositiveSource(self, value):
-        self._set(positiveSource=value)
-        return self
+        return self._set(positiveSource=value)
 
     def setNegativeSource(self, value):
-        self._set(negativeSource=value)
-        return self
+        return self._set(negativeSource=value)
 
     def setPruneCorpus(self, value):
-        self._set(pruneCorpus=value)
-        return self
+        return self._set(pruneCorpus=value)
 
     def setParams(self,
                   positiveSource="",
                   negativeSource="",
-                  pruneCorpus=False):
+                  pruneCorpus=1,
+                  tokenPattern="\\S+"):
         kwargs = self._input_kwargs
         return self._set(**kwargs)
 
@@ -414,6 +419,11 @@ class NorvigSweetingApproach(JavaEstimator, JavaMLWritable, JavaMLReadable, Anno
                        "dataset corpus format. txt or txtds allowed only",
                        typeConverter=TypeConverters.toString)
 
+    tokenPattern = Param(Params._dummy(),
+                         "tokenPattern",
+                         "Regex pattern to use in tokenization of corpus. Defaults [a-zA-Z]+",
+                         typeConverter=TypeConverters.toString)
+
     slangPath = Param(Params._dummy(),
                       "slangPath",
                       "slangs dictionary path",
@@ -454,32 +464,28 @@ def __init__(self,
         self.setParams(**kwargs)
 
     def setCorpusPath(self, value):
-        self._set(corpusPath=value)
-        return self
+        return self._set(corpusPath=value)
 
     def setCorpusFormat(self, value):
-        self._set(corpusFormat=value)
-        return self
+        return self._set(corpusFormat=value)
+
+    def setTokenPattern(self, value):
+        return self._set(tokenPattern=value)
 
     def setDictPath(self, value):
-        self._set(dictPath=value)
-        return self
+        return self._set(dictPath=value)
 
     def setSlangPath(self, value):
-        self._set(slangPath=value)
-        return self
+        return self._set(slangPath=value)
 
     def setCaseSensitive(self, value):
-        self._set(caseSensitive=value)
-        return self
+        return self._set(caseSensitive=value)
 
     def setDoubleVariants(self, value):
-        self._set(doubleVariants=value)
-        return self
+        return self._set(doubleVariants=value)
 
     def setShortCircuit(self, value):
-        self._set(shortCircuit=value)
-        return self
+        return self._set(shortCircuit=value)
 
     def setParams(self,
                   dictPath="/spell/words.txt",
 
@@ -2,6 +2,7 @@ package com.johnsnowlabs.nlp.annotators.parser.dep
 
 import com.johnsnowlabs.nlp.AnnotatorApproach
 import com.johnsnowlabs.nlp.AnnotatorType._
+import org.apache.spark.ml.param.Param
 import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
 import org.apache.spark.sql.Dataset
 
@@ -10,12 +11,17 @@ class DependencyParser(override val uid: String) extends AnnotatorApproach[Depen
 
   def this() = this(Identifiable.randomUID(DEPENDENCY))
 
+  val sourcePath = new Param[String](this, "sourcePath", "source file for dependency model")
+  setDefault(sourcePath, "/dependency_parser/models/dep-model.txt") // previously hard-coded model path, kept as fallback so train() works when unset
+  def setSourcePath(value: String): this.type = set(sourcePath, value)
+
   override val annotatorType = DEPENDENCY
 
   override val requiredAnnotatorTypes = Array(DOCUMENT, POS, TOKEN)
 
   override def train(dataset: Dataset[_]): DependencyParserModel = {
     new DependencyParserModel()
+      .setSourcePath($(sourcePath))
   }
 }
 
 
@@ -2,9 +2,10 @@ package com.johnsnowlabs.nlp.annotators.parser.dep
 
 import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel}
 import com.johnsnowlabs.nlp.AnnotatorType._
-import com.johnsnowlabs.nlp.annotators.common.{DependencyParsed, DependencyParsedSentence, PosTagged, TaggedSentence}
+import com.johnsnowlabs.nlp.annotators.common.{DependencyParsed, DependencyParsedSentence, PosTagged}
 import com.johnsnowlabs.nlp.annotators.common.Annotated.PosTaggedSentence
 import com.johnsnowlabs.nlp.annotators.parser.dep.GreedyTransition._
+import org.apache.spark.ml.param.Param
 import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
 
 class DependencyParserModel(override val uid: String) extends AnnotatorModel[DependencyParserModel] {
@@ -14,9 +15,13 @@ class DependencyParserModel(override val uid: String) extends AnnotatorModel[Dep
 
   override val requiredAnnotatorTypes =  Array[String](DOCUMENT, POS, TOKEN)
 
+  val sourcePath = new Param[String](this, "sourcePath", "source file for dependency model")
+  setDefault(sourcePath, "/dependency_parser/models/dep-model.txt") // previously hard-coded model path, kept as fallback so tag() works when unset
+  def setSourcePath(value: String): this.type = set(sourcePath, value)
+
   def tag(sentence: PosTaggedSentence): DependencyParsedSentence = {
     val model = new GreedyTransitionApproach()
-    model.parse(sentence)
+    model.parse(sentence, $(sourcePath))
   }
 
   override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
 
@@ -3,16 +3,17 @@ package com.johnsnowlabs.nlp.annotators.parser.dep.GreedyTransition
 import com.johnsnowlabs.nlp.annotators.common.{DependencyParsedSentence, WordWithDependency}
 import com.johnsnowlabs.nlp.util.io.ResourceHelper
 import com.johnsnowlabs.nlp.annotators.common.Annotated.PosTaggedSentence
+
 import scala.collection.mutable
 
 /**
   * Parser based on the code of Matthew Honnibal and Martin Andrews
   */
 class GreedyTransitionApproach {
 
-  def parse(posTagged: PosTaggedSentence, format: String = "TXT"): DependencyParsedSentence = {
+  def parse(posTagged: PosTaggedSentence, source: String, format: String = "TXT"): DependencyParsedSentence = {
     val parser = new Parser
-    parser.perceptron.load(ResourceHelper.parseLinesText("/dependency_parser/models/dep-model.txt", format.toUpperCase).toIterator)
+    parser.perceptron.load(ResourceHelper.parseLinesText(source, format.toUpperCase).toIterator)
     val sentence: Sentence = posTagged.indexedTaggedWords
       .map { item => WordData(item.word, item.tag) }.toList
     val dependencies = parser.parse(sentence)
 
@@ -2,7 +2,7 @@ package com.johnsnowlabs.nlp.annotators.sda.vivekn
 
 import com.johnsnowlabs.nlp.AnnotatorApproach
 import com.johnsnowlabs.nlp.util.io.ResourceHelper
-import org.apache.spark.ml.param.{BooleanParam, Param}
+import org.apache.spark.ml.param.{IntParam, Param}
 import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
 import org.apache.spark.sql.Dataset
 
@@ -25,8 +25,10 @@ class ViveknSentimentApproach(override val uid: String)
     */
   val positiveSourcePath = new Param[String](this, "positiveSource", "source file for positive sentences")
   val negativeSourcePath = new Param[String](this, "negativeSource", "source file for negative sentences")
-  val pruneCorpus = new BooleanParam(this, "pruneCorpus", "set to false if training corpus is small")
-  setDefault(pruneCorpus, true)
+  val pruneCorpus = new IntParam(this, "pruneCorpus", "Removes unfrequent scenarios from scope. The higher the better performance. Defaults 1")
+  val tokenPattern = new Param[String](this, "tokenPattern", "Regex pattern to use in tokenization of corpus. Defaults \\S+")
+  setDefault(pruneCorpus, 1)
+  setDefault(tokenPattern, "\\S+")
 
   def this() = this(Identifiable.randomUID("VIVEKN"))
 
@@ -38,47 +40,28 @@ class ViveknSentimentApproach(override val uid: String)
 
   def setNegativeSourcePath(value: String): this.type = set(negativeSourcePath, value)
 
-  def setCorpusPrune(value: Boolean): this.type = set(pruneCorpus, value)
+  def setCorpusPrune(value: Int): this.type = set(pruneCorpus, value)
+
+  def setTokenPattern(value: String): this.type = set(tokenPattern, value)
 
   override def train(dataset: Dataset[_]): ViveknSentimentModel = {
 
-    var positive: MMap[String, Int] = ResourceHelper.wordCount(
-      $(positiveSourcePath),
-      "txt",
-      clean=false,
-      f=Some(w => ViveknSentimentApproach.negateSequence(w))
-    )
-    var negative: MMap[String, Int] = ResourceHelper.wordCount(
-      $(negativeSourcePath),
-      "txt",
-      clean=false,
-      f=Some(w => ViveknSentimentApproach.negateSequence(w))
+    val fromPositive: (MMap[String, Int], MMap[String, Int]) = ResourceHelper.ViveknWordCount(
+      source=$(positiveSourcePath),
+      tokenPattern=$(tokenPattern),
+      prune=$(pruneCorpus),
+      f=w => ViveknSentimentApproach.negateSequence(w)
     )
 
-    /** add negated words */
-    negative = ResourceHelper.wordCount(
-      $(positiveSourcePath),
-      "txt",
-      m=negative,
-      clean=false,
-      prefix=Some("not_"),
-      f=Some(w => ViveknSentimentApproach.negateSequence(w))
-    )
-    positive = ResourceHelper.wordCount(
-      $(negativeSourcePath),
-      "txt",
-      m=positive,
-      clean=false,
-      prefix=Some("not_"),
-      f=Some(w => ViveknSentimentApproach.negateSequence(w))
+    val (negative, positive) = ResourceHelper.ViveknWordCount(
+      source=$(negativeSourcePath),
+      tokenPattern=$(tokenPattern),
+      prune=$(pruneCorpus),
+      f=w => ViveknSentimentApproach.negateSequence(w),
+      fromPositive._2,
+      fromPositive._1
     )
 
-    /** remove features that appear only once */
-    if ($(pruneCorpus)) {
-      positive = positive.filter { case (_, count) => count > 1 }
-      negative = negative.filter { case (_, count) => count > 1 }
-    }
-
     val positiveTotals = positive.values.sum
     val negativeTotals = negative.values.sum
 
 
@@ -19,9 +19,11 @@ class NorvigSweetingApproach(override val uid: String)
   val corpusFormat = new Param[String](this, "corpusFormat", "dataset corpus format. txt or txtds allowed only")
   val dictPath = new Param[String](this, "dictPath", "path to dictionary of words")
   val slangPath = new Param[String](this, "slangPath", "path to custom dictionaries")
+  val tokenPattern = new Param[String](this, "tokenPattern", "Regex pattern to use in tokenization of corpus. Defaults [a-zA-Z]+")
 
   setDefault(dictPath, "/spell/words.txt")
   setDefault(corpusFormat, "TXT")
+  setDefault(tokenPattern, "[a-zA-Z]+")
 
   setDefault(caseSensitive, false)
   setDefault(doubleVariants, false)
@@ -41,11 +43,13 @@ class NorvigSweetingApproach(override val uid: String)
 
   def setSlangPath(value: String): this.type = set(slangPath, value)
 
+  def setTokenPattern(value: String): this.type = set(tokenPattern, value)
+
   override def train(dataset: Dataset[_]): NorvigSweetingModel = {
-    val loadWords = ResourceHelper.wordCount($(dictPath), $(corpusFormat).toUpperCase)
+    val loadWords = ResourceHelper.wordCount($(dictPath), $(corpusFormat).toUpperCase, $(tokenPattern))
     val corpusWordCount =
       if (get(corpusPath).isDefined) {
-        ResourceHelper.wordCount($(corpusPath), $(corpusFormat).toUpperCase)
+        ResourceHelper.wordCount($(corpusPath), $(corpusFormat).toUpperCase, $(tokenPattern))
       } else {
       Map.empty[String, Int]
       }