Skip to content

Commit bdfd39c

Browse files
authored
Merge pull request #82 from JohnSnowLabs/vivekn-sentiment-analysis-training-imp
Improved Vivekn sentiment analysis training performance
2 parents c1b3e11 + bc4f41d commit bdfd39c

File tree

11 files changed

+127
-93
lines changed

11 files changed

+127
-93
lines changed

python/sparknlp/annotator.py

Lines changed: 31 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -352,41 +352,46 @@ class ViveknSentimentApproach(JavaEstimator, JavaMLWritable, JavaMLReadable, Ann
352352

353353
pruneCorpus = Param(Params._dummy(),
354354
"pruneCorpus",
355-
"whether to prune low frequency words",
356-
typeConverter=TypeConverters.toBoolean)
355+
"Removes unfrequent scenarios from scope. The higher the better performance. Defaults 1",
356+
typeConverter=TypeConverters.toInt)
357+
358+
# Regex used to tokenize the training corpus. The registered name must be
# "tokenPattern" so it matches this attribute and the JVM-side parameter of
# the same name; registering it as "negativeSource" would collide with the
# negativeSource parameter declared above.
tokenPattern = Param(Params._dummy(),
                     "tokenPattern",
                     "Regex pattern to use in tokenization of corpus. Defaults \\S+",
                     typeConverter=TypeConverters.toString)
357362

358363
@keyword_only
359364
def __init__(self,
360365
positiveSource="",
361366
negativeSource="",
362-
pruneCorpus=False
367+
pruneCorpus=1,
368+
tokenPattern="\\S+"
363369
):
364370
super(ViveknSentimentApproach, self).__init__()
365371
self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.sda.vivekn.ViveknSentimentApproach", self.uid)
366372
kwargs = self._input_kwargs
367373
self._setDefault(
368374
positiveSource="",
369375
negativeSource="",
370-
pruneCorpus=False
376+
pruneCorpus=1,
377+
tokenPattern="\\S+"
371378
)
372379
self.setParams(**kwargs)
373380

374381
def setPositiveSource(self, value):
375-
self._set(positiveSource=value)
376-
return self
382+
return self._set(positiveSource=value)
377383

378384
def setNegativeSource(self, value):
379-
self._set(negativeSource=value)
380-
return self
385+
return self._set(negativeSource=value)
381386

382387
def setPruneCorpus(self, value):
383-
self._set(pruneCorpus=value)
384-
return self
388+
return self._set(pruneCorpus=value)
385389

386390
def setParams(self,
387391
positiveSource="",
388392
negativeSource="",
389-
pruneCorpus=False):
393+
pruneCorpus=1,
394+
tokenPattern="\\S+"):
390395
kwargs = self._input_kwargs
391396
return self._set(**kwargs)
392397

@@ -414,6 +419,11 @@ class NorvigSweetingApproach(JavaEstimator, JavaMLWritable, JavaMLReadable, Anno
414419
"dataset corpus format. txt or txtds allowed only",
415420
typeConverter=TypeConverters.toString)
416421

422+
tokenPattern = Param(Params._dummy(),
423+
"tokenPattern",
424+
"Regex pattern to use in tokenization of corpus. Defaults [a-zA-Z]+",
425+
typeConverter=TypeConverters.toString)
426+
417427
slangPath = Param(Params._dummy(),
418428
"slangPath",
419429
"slangs dictionary path",
@@ -454,32 +464,28 @@ def __init__(self,
454464
self.setParams(**kwargs)
455465

456466
def setCorpusPath(self, value):
457-
self._set(corpusPath=value)
458-
return self
467+
return self._set(corpusPath=value)
459468

460469
def setCorpusFormat(self, value):
461-
self._set(corpusFormat=value)
462-
return self
470+
return self._set(corpusFormat=value)
471+
472+
def setTokenPattern(self, value):
473+
return self._set(tokenPattern=value)
463474

464475
def setDictPath(self, value):
465-
self._set(dictPath=value)
466-
return self
476+
return self._set(dictPath=value)
467477

468478
def setSlangPath(self, value):
469-
self._set(slangPath=value)
470-
return self
479+
return self._set(slangPath=value)
471480

472481
def setCaseSensitive(self, value):
473-
self._set(caseSensitive=value)
474-
return self
482+
return self._set(caseSensitive=value)
475483

476484
def setDoubleVariants(self, value):
477-
self._set(doubleVariants=value)
478-
return self
485+
return self._set(doubleVariants=value)
479486

480487
def setShortCircuit(self, value):
481-
self._set(shortCircuit=value)
482-
return self
488+
return self._set(shortCircuit=value)
483489

484490
def setParams(self,
485491
dictPath="/spell/words.txt",

src/main/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParser.scala

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package com.johnsnowlabs.nlp.annotators.parser.dep
22

33
import com.johnsnowlabs.nlp.AnnotatorApproach
44
import com.johnsnowlabs.nlp.AnnotatorType._
5+
import org.apache.spark.ml.param.Param
56
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
67
import org.apache.spark.sql.Dataset
78

@@ -10,12 +11,17 @@ class DependencyParser(override val uid: String) extends AnnotatorApproach[Depen
1011

1112
def this() = this(Identifiable.randomUID(DEPENDENCY))
1213

14+
/** Resource path of the dependency model file handed to the trained model. */
val sourcePath = new Param[String](this, "sourcePath", "source file for dependency model")
// train() reads $(sourcePath) unconditionally; default to the path the parser
// loaded before it became configurable so callers that never set it keep working.
setDefault(sourcePath, "/dependency_parser/models/dep-model.txt")
15+
16+
def setSourcePath(value: String): this.type = set(sourcePath, value)
17+
1318
override val annotatorType = DEPENDENCY
1419

1520
override val requiredAnnotatorTypes = Array(DOCUMENT, POS, TOKEN)
1621

1722
override def train(dataset: Dataset[_]): DependencyParserModel = {
1823
new DependencyParserModel()
24+
.setSourcePath($(sourcePath))
1925
}
2026
}
2127

src/main/scala/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserModel.scala

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@ package com.johnsnowlabs.nlp.annotators.parser.dep
22

33
import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel}
44
import com.johnsnowlabs.nlp.AnnotatorType._
5-
import com.johnsnowlabs.nlp.annotators.common.{DependencyParsed, DependencyParsedSentence, PosTagged, TaggedSentence}
5+
import com.johnsnowlabs.nlp.annotators.common.{DependencyParsed, DependencyParsedSentence, PosTagged}
66
import com.johnsnowlabs.nlp.annotators.common.Annotated.PosTaggedSentence
77
import com.johnsnowlabs.nlp.annotators.parser.dep.GreedyTransition._
8+
import org.apache.spark.ml.param.Param
89
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
910

1011
class DependencyParserModel(override val uid: String) extends AnnotatorModel[DependencyParserModel] {
@@ -14,9 +15,13 @@ class DependencyParserModel(override val uid: String) extends AnnotatorModel[Dep
1415

1516
override val requiredAnnotatorTypes = Array[String](DOCUMENT, POS, TOKEN)
1617

18+
/** Resource path of the dependency model file loaded when tagging a sentence. */
val sourcePath = new Param[String](this, "sourcePath", "source file for dependency model")
// tag() reads $(sourcePath) on every call; default to the path the parser
// loaded before it became configurable so a directly constructed model still works.
setDefault(sourcePath, "/dependency_parser/models/dep-model.txt")
19+
20+
def setSourcePath(value: String): this.type = set(sourcePath, value)
21+
1722
def tag(sentence: PosTaggedSentence): DependencyParsedSentence = {
1823
val model = new GreedyTransitionApproach()
19-
model.parse(sentence)
24+
model.parse(sentence, $(sourcePath))
2025
}
2126

2227
override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {

src/main/scala/com/johnsnowlabs/nlp/annotators/parser/dep/GreedyTransition/GreedyTransitionApproach.scala

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,17 @@ package com.johnsnowlabs.nlp.annotators.parser.dep.GreedyTransition
33
import com.johnsnowlabs.nlp.annotators.common.{DependencyParsedSentence, WordWithDependency}
44
import com.johnsnowlabs.nlp.util.io.ResourceHelper
55
import com.johnsnowlabs.nlp.annotators.common.Annotated.PosTaggedSentence
6+
67
import scala.collection.mutable
78

89
/**
910
* Parser based on the code of Matthew Honnibal and Martin Andrews
1011
*/
1112
class GreedyTransitionApproach {
1213

13-
def parse(posTagged: PosTaggedSentence, format: String = "TXT"): DependencyParsedSentence = {
14+
def parse(posTagged: PosTaggedSentence, source: String, format: String = "TXT"): DependencyParsedSentence = {
1415
val parser = new Parser
15-
parser.perceptron.load(ResourceHelper.parseLinesText("/dependency_parser/models/dep-model.txt", format.toUpperCase).toIterator)
16+
parser.perceptron.load(ResourceHelper.parseLinesText(source, format.toUpperCase).toIterator)
1617
val sentence: Sentence = posTagged.indexedTaggedWords
1718
.map { item => WordData(item.word, item.tag) }.toList
1819
val dependencies = parser.parse(sentence)

src/main/scala/com/johnsnowlabs/nlp/annotators/sda/vivekn/ViveknSentimentApproach.scala

Lines changed: 20 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ package com.johnsnowlabs.nlp.annotators.sda.vivekn
22

33
import com.johnsnowlabs.nlp.AnnotatorApproach
44
import com.johnsnowlabs.nlp.util.io.ResourceHelper
5-
import org.apache.spark.ml.param.{BooleanParam, Param}
5+
import org.apache.spark.ml.param.{IntParam, Param}
66
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
77
import org.apache.spark.sql.Dataset
88

@@ -25,8 +25,10 @@ class ViveknSentimentApproach(override val uid: String)
2525
*/
2626
val positiveSourcePath = new Param[String](this, "positiveSource", "source file for positive sentences")
2727
val negativeSourcePath = new Param[String](this, "negativeSource", "source file for negative sentences")
28-
val pruneCorpus = new BooleanParam(this, "pruneCorpus", "set to false if training corpus is small")
29-
setDefault(pruneCorpus, true)
28+
val pruneCorpus = new IntParam(this, "pruneCorpus", "Removes unfrequent scenarios from scope. The higher the better performance. Defaults 1")
29+
val tokenPattern = new Param[String](this, "tokenPattern", "Regex pattern to use in tokenization of corpus. Defaults \\S+")
30+
setDefault(pruneCorpus, 1)
31+
setDefault(tokenPattern, "\\S+")
3032

3133
def this() = this(Identifiable.randomUID("VIVEKN"))
3234

@@ -38,47 +40,28 @@ class ViveknSentimentApproach(override val uid: String)
3840

3941
def setNegativeSourcePath(value: String): this.type = set(negativeSourcePath, value)
4042

41-
def setCorpusPrune(value: Boolean): this.type = set(pruneCorpus, value)
43+
def setCorpusPrune(value: Int): this.type = set(pruneCorpus, value)
44+
45+
def setTokenPattern(value: String): this.type = set(tokenPattern, value)
4246

4347
override def train(dataset: Dataset[_]): ViveknSentimentModel = {
4448

45-
var positive: MMap[String, Int] = ResourceHelper.wordCount(
46-
$(positiveSourcePath),
47-
"txt",
48-
clean=false,
49-
f=Some(w => ViveknSentimentApproach.negateSequence(w))
50-
)
51-
var negative: MMap[String, Int] = ResourceHelper.wordCount(
52-
$(negativeSourcePath),
53-
"txt",
54-
clean=false,
55-
f=Some(w => ViveknSentimentApproach.negateSequence(w))
49+
val fromPositive: (MMap[String, Int], MMap[String, Int]) = ResourceHelper.ViveknWordCount(
50+
source=$(positiveSourcePath),
51+
tokenPattern=$(tokenPattern),
52+
prune=$(pruneCorpus),
53+
f=w => ViveknSentimentApproach.negateSequence(w)
5654
)
5755

58-
/** add negated words */
59-
negative = ResourceHelper.wordCount(
60-
$(positiveSourcePath),
61-
"txt",
62-
m=negative,
63-
clean=false,
64-
prefix=Some("not_"),
65-
f=Some(w => ViveknSentimentApproach.negateSequence(w))
66-
)
67-
positive = ResourceHelper.wordCount(
68-
$(negativeSourcePath),
69-
"txt",
70-
m=positive,
71-
clean=false,
72-
prefix=Some("not_"),
73-
f=Some(w => ViveknSentimentApproach.negateSequence(w))
56+
val (negative, positive) = ResourceHelper.ViveknWordCount(
57+
source=$(negativeSourcePath),
58+
tokenPattern=$(tokenPattern),
59+
prune=$(pruneCorpus),
60+
f=w => ViveknSentimentApproach.negateSequence(w),
61+
fromPositive._2,
62+
fromPositive._1
7463
)
7564

76-
/** remove features that appear only once */
77-
if ($(pruneCorpus)) {
78-
positive = positive.filter { case (_, count) => count > 1 }
79-
negative = negative.filter { case (_, count) => count > 1 }
80-
}
81-
8265
val positiveTotals = positive.values.sum
8366
val negativeTotals = negative.values.sum
8467

src/main/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingApproach.scala

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,11 @@ class NorvigSweetingApproach(override val uid: String)
1919
val corpusFormat = new Param[String](this, "corpusFormat", "dataset corpus format. txt or txtds allowed only")
2020
val dictPath = new Param[String](this, "dictPath", "path to dictionary of words")
2121
val slangPath = new Param[String](this, "slangPath", "path to custom dictionaries")
22+
val tokenPattern = new Param[String](this, "tokenPattern", "Regex pattern to use in tokenization of corpus. Defaults [a-zA-Z]+")
2223

2324
setDefault(dictPath, "/spell/words.txt")
2425
setDefault(corpusFormat, "TXT")
26+
setDefault(tokenPattern, "[a-zA-Z]+")
2527

2628
setDefault(caseSensitive, false)
2729
setDefault(doubleVariants, false)
@@ -41,11 +43,13 @@ class NorvigSweetingApproach(override val uid: String)
4143

4244
def setSlangPath(value: String): this.type = set(slangPath, value)
4345

46+
def setTokenPattern(value: String): this.type = set(tokenPattern, value)
47+
4448
override def train(dataset: Dataset[_]): NorvigSweetingModel = {
45-
val loadWords = ResourceHelper.wordCount($(dictPath), $(corpusFormat).toUpperCase)
49+
val loadWords = ResourceHelper.wordCount($(dictPath), $(corpusFormat).toUpperCase, $(tokenPattern))
4650
val corpusWordCount =
4751
if (get(corpusPath).isDefined) {
48-
ResourceHelper.wordCount($(corpusPath), $(corpusFormat).toUpperCase)
52+
ResourceHelper.wordCount($(corpusPath), $(corpusFormat).toUpperCase, $(tokenPattern))
4953
} else {
5054
Map.empty[String, Int]
5155
}

0 commit comments

Comments
 (0)