Skip to content

Commit d3b9086

Browse files
committed
2 parents b81e95c + 9cb02ff commit d3b9086

File tree

9 files changed

+788
-325
lines changed

9 files changed

+788
-325
lines changed

python/example/crf-ner/ner.ipynb

Lines changed: 169 additions & 299 deletions
Large diffs are not rendered by default.

python/example/crf-ner/ner_benchmark.ipynb

Lines changed: 535 additions & 0 deletions
Large diffs are not rendered by default.

python/example/vivekn-sentiment/sentiment.ipynb

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,9 @@
3838
{
3939
"cell_type": "code",
4040
"execution_count": null,
41-
"metadata": {},
41+
"metadata": {
42+
"collapsed": true
43+
},
4244
"outputs": [],
4345
"source": [
4446
"#Load the input data to be annotated\n",
@@ -158,7 +160,9 @@
158160
{
159161
"cell_type": "code",
160162
"execution_count": null,
161-
"metadata": {},
163+
"metadata": {
164+
"collapsed": true
165+
},
162166
"outputs": [],
163167
"source": [
164168
"pipeline = Pipeline(stages=[\n",
@@ -178,7 +182,9 @@
178182
{
179183
"cell_type": "code",
180184
"execution_count": null,
181-
"metadata": {},
185+
"metadata": {
186+
"collapsed": true
187+
},
182188
"outputs": [],
183189
"source": [
184190
"for r in sentiment_data.take(5):\n",
@@ -211,7 +217,9 @@
211217
{
212218
"cell_type": "code",
213219
"execution_count": null,
214-
"metadata": {},
220+
"metadata": {
221+
"collapsed": true
222+
},
215223
"outputs": [],
216224
"source": [
217225
"Pipeline.read().load(\"./ps\")\n",
@@ -231,7 +239,7 @@
231239
"metadata": {
232240
"anaconda-cloud": {},
233241
"kernelspec": {
234-
"display_name": "Python 3",
242+
"display_name": "Python [default]",
235243
"language": "python",
236244
"name": "python3"
237245
},

src/main/resources/log4j.properties

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@ log4j.appender.STDOUT=org.apache.log4j.ConsoleAppender
33
log4j.appender.STDOUT.layout=org.apache.log4j.PatternLayout
44
log4j.appender.STDOUT.layout.ConversionPattern=[%5p] %m%n
55

6-
log4j.logger.AnnotatorLogger=WARNING
6+
log4j.logger.AnnotatorLogger=WARNING
7+
log4j.logger.CRF=INFO

src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/DictionaryFeatures.scala

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
package com.johnsnowlabs.nlp.annotators.ner.crf
22

3+
import com.johnsnowlabs.nlp.util.io.ResourceHelper
4+
35
import scala.io.Source
46

57
case class DictionaryFeatures(dict: Map[String, String])
@@ -31,8 +33,8 @@ object DictionaryFeatures {
3133
}
3234

3335
private def read(path: String): Iterator[(String, String)] = {
34-
Source.fromFile(path).getLines().map{
35-
line =>
36+
ResourceHelper.SourceStream(path)
37+
.content.getLines().map{line =>
3638
val items = line.split(":")
3739
require(items.size == 2)
3840
(items(0), items(1))

src/main/scala/com/johnsnowlabs/nlp/annotators/ner/crf/NerCrfApproach.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ class NerCrfApproach(override val uid: String) extends AnnotatorApproach[NerCrfM
8989
.setOutputCol("token")
9090

9191
val posTagger = new PerceptronApproach()
92-
.setCorpusPath("/anc-pos-corpus/")
92+
.setCorpusPath("anc-pos-corpus/")
9393
.setNIterations(10)
9494
.setInputCols("token", "document")
9595
.setOutputCol("pos")

src/main/scala/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronApproach.scala

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
package com.johnsnowlabs.nlp.annotators.pos.perceptron
22

33
import java.io.File
4+
import java.nio.file.Paths
45

56
import com.johnsnowlabs.nlp.AnnotatorApproach
67
import com.johnsnowlabs.nlp.annotators.common.{TaggedSentence, TaggedWord}
8+
import com.johnsnowlabs.nlp.util.io.ResourceHelper
79
import com.johnsnowlabs.nlp.util.io.ResourceHelper.{SourceStream, pathIsDirectory}
810
import com.typesafe.config.{Config, ConfigFactory}
911
import org.apache.spark.ml.param.{IntParam, Param}
@@ -218,15 +220,12 @@ object PerceptronApproach extends DefaultParamsReadable[PerceptronApproach] {
218220
.flatMap(fileName => parsePOSCorpusFromSource(fileName.toString, tagSeparator))
219221
} catch {
220222
case _: NullPointerException =>
221-
val sourceStream = SourceStream(dirName)
222-
val res = sourceStream
223-
.content
224-
.getLines()
223+
ResourceHelper.listDirectory(dirName)
225224
.take(fileLimit)
226-
.flatMap(fileName => parsePOSCorpusFromSource(dirName + "/" + fileName, tagSeparator))
225+
.flatMap{fileName =>
226+
val path = Paths.get(dirName, fileName)
227+
parsePOSCorpusFromSource(path.toString, tagSeparator)}
227228
.toArray
228-
sourceStream.close()
229-
res
230229
}
231230
}
232231

@@ -246,7 +245,7 @@ object PerceptronApproach extends DefaultParamsReadable[PerceptronApproach] {
246245
if (pathIsDirectory(dirOrFilePath)) parsePOSCorpusFromDir(dirOrFilePath, posSeparator, fileLimit)
247246
else parsePOSCorpusFromSource(dirOrFilePath, posSeparator)
248247
}
249-
if (result.isEmpty) throw new Exception("Empty corpus for POS")
248+
if (result.isEmpty) throw new Exception(s"Empty corpus for POS in $posDirOrFilePath")
250249
result
251250
}
252251

src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala

Lines changed: 53 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ import org.apache.spark.sql.SparkSession
1212
import scala.collection.mutable.{ArrayBuffer, Map => MMap}
1313
import scala.io.Source
1414

15+
import java.net.URLDecoder
16+
import java.util.jar.JarFile
17+
1518

1619
/**
1720
* Created by saif on 28/04/17.
@@ -24,15 +27,59 @@ object ResourceHelper {
2427

2528
private val spark: SparkSession = SparkSession.builder().getOrCreate()
2629

30+
31+
/**
  * Lists the names of the direct children (files and sub-directories) of a resource directory.
  *
  * The directory is resolved first relative to this class and then through the class loader.
  * It may live on disk (`file:` URL) or inside a JAR (`jar:` URL).
  *
  * @param path resource directory to list, e.g. "anc-pos-corpus/"
  * @return sorted child names; sub-directories are reported once, by name only
  * @throws FileNotFoundException if the path cannot be resolved as a resource
  * @throws UnsupportedOperationException if the resolved URL is neither `file:` nor `jar:`
  */
def listDirectory(path: String): Seq[String] = {
  val dirURL = Option(getClass.getResource(path))
    .orElse(Option(getClass.getClassLoader.getResource(path)))
    /* path not in resources and not in disk */
    .getOrElse(throw new FileNotFoundException(path))

  dirURL.getProtocol match {
    case "file" =>
      /* A file path: easy enough. File.list() yields null when the target is not a
         listable directory, so guard against it instead of failing with an NPE. */
      Option(new File(dirURL.toURI).list()).map(_.sorted.toSeq).getOrElse(Seq.empty[String])

    case "jar" =>
      /* A JAR path: the URL path has the form "file:<jar location>!/<entry>",
         so drop the 5-character "file:" prefix and everything from "!" onwards. */
      val urlPath = dirURL.getPath
      val jarPath = urlPath.substring(5, urlPath.indexOf("!"))

      // Only a *leading* slash is removed; the prefix must end with "/" so that
      // sibling entries sharing the same name prefix are not picked up.
      val relativePath = path.stripPrefix("/")
      val prefix =
        if (relativePath.isEmpty || relativePath.endsWith("/")) relativePath
        else relativePath + "/"

      val jar = new JarFile(URLDecoder.decode(jarPath, "UTF-8"))
      try {
        val entries = jar.entries()
        Iterator.continually(entries)
          .takeWhile(_.hasMoreElements)
          .map(_.nextElement().getName.stripPrefix("/"))
          .filter(_.startsWith(prefix)) // filter according to the path
          // if it is a subdirectory, we just return the directory name
          .map(_.substring(prefix.length).takeWhile(_ != '/'))
          .filter(_.nonEmpty) // skips the entry of the listed directory itself
          .toVector // materialise everything before the JAR is closed
          .distinct
          .sorted
      } finally {
        jar.close()
      }

    case _ =>
      throw new UnsupportedOperationException(s"Cannot list files for URL $dirURL")
  }
}
71+
2772
/** Structure for a SourceStream coming from compiled content */
2873
case class SourceStream(resource: String) {
29-
val pipe: Option[InputStream] = try {
30-
getClass.getResourceAsStream(resource).close()
31-
Some(getClass.getResourceAsStream(resource))
32-
} catch {
33-
case _: NullPointerException => None
74+
val pipe: Option[InputStream] = {
75+
var stream = getClass.getResourceAsStream(resource)
76+
if (stream == null)
77+
stream = getClass.getClassLoader.getResourceAsStream(resource)
78+
Option(stream)
3479
}
35-
val content: Source = pipe.map(p => Source.fromInputStream(p)("UTF-8")).getOrElse(Source.fromFile(resource, "UTF-8"))
80+
val content: Source = pipe.map(p => {
81+
Source.fromInputStream(p)("UTF-8")
82+
}).getOrElse(Source.fromFile(resource, "UTF-8"))
3683
def close(): Unit = {
3784
content.close()
3885
pipe.foreach(_.close())

src/test/scala/com/johnsnowlabs/ml/crf/CoNLL2003PipelineTest.scala

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ object CoNLL2003PipelineTest extends App {
3939
.setOutputCol("token")
4040

4141
val posTagger = new PerceptronApproach()
42-
.setCorpusPath("/anc-pos-corpus/")
42+
.setCorpusPath("anc-pos-corpus/")
4343
.setNIterations(10)
4444
.setInputCols("token", "document")
4545
.setOutputCol("pos")
@@ -55,9 +55,10 @@ object CoNLL2003PipelineTest extends App {
5555
val nerTagger = new NerCrfApproach()
5656
.setInputCols("sentence", "token", "pos")
5757
.setLabelColumn("label")
58-
.setC0(1250000)
58+
.setDatsetPath("eng.train")
59+
.setC0(2250000)
5960
.setRandomSeed(100)
60-
.setMaxEpochs(10)
61+
.setMaxEpochs(20)
6162
.setOutputCol("ner")
6263

6364
getPosStages() :+ nerTagger

0 commit comments

Comments
 (0)