Commit fd64bb9

Merge pull request #102 from JohnSnowLabs/annotators-train-from-fit
Annotators train from fit
2 parents 68de3f4 + 1d40fef commit fd64bb9

82 files changed: +1660 −1462 lines


CHANGELOG

Lines changed: 22 additions & 0 deletions
@@ -1,3 +1,25 @@
+========
+1.4.0
+========
+---------------
+New features
+---------------
+
+* ExternalResource helpers used to represent external data information. Such information includes the format,
+delimiters and how to read it.
+* SpellChecker, ViveknSentiment and POS Perceptron can now train from the dataset passed to fit().
+This is more Spark-like, as it should always have been. New params included as required.
+
+---------------
+Enhancements
+---------------
+
+* ResourceHelper now has an improved SourceStream class which allows for more consistent HDFS/Filesystem reading by using
+more of the Hadoop APIs.
+* application.conf is a global setting and can be overridden.
+* PySpark API improved by creating AnnotatorApproach and AnnotatorModel classes.
+* EntityMatcher now uses recursive Pipelines.
+
 ========
 1.3.0
 ========
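As a sketch of what the train-from-fit change looks like in practice, the cell from `python/example/crf-ner/ner.ipynb` further down this page reduces to the following PySpark fragment. Only the `PerceptronApproach` parameters visible in this commit are used; `documentAssembler`, `tokenizer` and `trainingData` are assumed to be defined as in that notebook.

```python
from pyspark.ml import Pipeline
from sparknlp.annotator import PerceptronApproach  # python/ package shipped with this repo

# The POS tagger no longer takes setCorpusPath(); it learns from whatever
# DataFrame is handed to fit(), like any other Spark ML estimator.
posTagger = PerceptronApproach() \
    .setIterations(5) \
    .setInputCols(["token", "document"]) \
    .setOutputCol("pos")

# documentAssembler, tokenizer and trainingData assumed set up as in the notebook.
pipeline = Pipeline(stages=[documentAssembler, tokenizer, posTagger])
model = pipeline.fit(trainingData)  # training now happens here
```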

README.md

Lines changed: 7 additions & 7 deletions
@@ -10,18 +10,18 @@ Take a look at our official spark-nlp page: http://nlp.johnsnowlabs.com/ for use
 
 This library has been uploaded to the spark-packages repository https://spark-packages.org/package/JohnSnowLabs/spark-nlp .
 
-To use the most recent version just add the `--packages JohnSnowLabs:spark-nlp:1.3.0` to your spark command
+To use the most recent version just add the `--packages JohnSnowLabs:spark-nlp:1.4.0` to your spark command
 
 ```sh
-spark-shell --packages JohnSnowLabs:spark-nlp:1.3.0
+spark-shell --packages JohnSnowLabs:spark-nlp:1.4.0
 ```
 
 ```sh
-pyspark --packages JohnSnowLabs:spark-nlp:1.3.0
+pyspark --packages JohnSnowLabs:spark-nlp:1.4.0
 ```
 
 ```sh
-spark-submit --packages JohnSnowLabs:spark-nlp:1.3.0
+spark-submit --packages JohnSnowLabs:spark-nlp:1.4.0
 ```
 
 If you want to use an old version, check the spark-packages website to see all the releases.

@@ -36,19 +36,19 @@ Our package is deployed to maven central. In order to add this package as a depe
 <dependency>
 <groupId>com.johnsnowlabs.nlp</groupId>
 <artifactId>spark-nlp_2.11</artifactId>
-<version>1.3.0</version>
+<version>1.4.0</version>
 </dependency>
 ```
 
 #### SBT
 ```sbtshell
-libraryDependencies += "com.johnsnowlabs.nlp" % "spark-nlp_2.11" % "1.3.0"
+libraryDependencies += "com.johnsnowlabs.nlp" % "spark-nlp_2.11" % "1.4.0"
 ```
 
 If you are using `scala 2.11`
 
 ```sbtshell
-libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "1.3.0"
+libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "1.4.0"
 ```
 
 ## Using the jar manually
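From PySpark code, the same package coordinate can also be set when building the session. This is a sketch using Spark's standard `spark.jars.packages` config rather than the command-line flag:

```python
from pyspark.sql import SparkSession

# Equivalent of `pyspark --packages JohnSnowLabs:spark-nlp:1.4.0`;
# Spark resolves the coordinate from the spark-packages repository.
spark = SparkSession.builder \
    .appName("spark-nlp-1.4.0") \
    .config("spark.jars.packages", "JohnSnowLabs:spark-nlp:1.4.0") \
    .getOrCreate()
```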

build.sbt

Lines changed: 4 additions & 1 deletion
@@ -7,7 +7,7 @@ name := "spark-nlp"
 
 organization := "com.johnsnowlabs.nlp"
 
-version := "1.3.0"
+version := "1.4.0"
 
 scalaVersion := scalaVer
 
@@ -110,6 +110,9 @@ testOptions in Test += Tests.Argument("-oF")
 /** Disables tests in assembly */
 test in assembly := {}
 
+/** Publish test artifact **/
+publishArtifact in Test := true
+
 /** Copies the assembled jar to the pyspark/lib dir **/
 lazy val copyAssembledJar = taskKey[Unit]("Copy assembled jar to pyspark/lib")
 

docs/quickstart.html

Lines changed: 5 additions & 5 deletions
@@ -95,16 +95,16 @@ <h2 class="section-title">Requirements</h2>
 depending on your desired use case:
 </p>
 </p>
-<pre><code class="language-python">spark-shell --packages JohnSnowLabs:spark-nlp:1.3.0
-pyspark --packages JohnSnowLabs:spark-nlp:1.3.0
-spark-submit --packages JohnSnowLabs:spark-nlp:1.3.0
+<pre><code class="language-python">spark-shell --packages JohnSnowLabs:spark-nlp:1.4.0
+pyspark --packages JohnSnowLabs:spark-nlp:1.4.0
+spark-submit --packages JohnSnowLabs:spark-nlp:1.4.0
 </code></pre>
 <p>
 Another way to use the library is by appending jar file into spark classpath,
 which can be downloaded
-<a href="http://repo1.maven.org/maven2/com/johnsnowlabs/nlp/spark-nlp_2.11/1.3.0/spark-nlp_2.11-1.3.0.jar">here</a>
+<a href="http://repo1.maven.org/maven2/com/johnsnowlabs/nlp/spark-nlp_2.11/1.4.0/spark-nlp_2.11-1.4.0.jar">here</a>
 then, run spark-shell or spark-submit with appropriate <b>--jars
-/path/to/spark-nlp_2.11-1.3.0.jar</b> to use the library in spark.
+/path/to/spark-nlp_2.11-1.4.0.jar</b> to use the library in spark.
 </p>
 <p>
 For further alternatives and documentation check out our README page in <a href="https://github.com/JohnSnowLabs/spark-nlp">GitHub</a>.
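For the manual-jar route from PySpark, the downloaded jar can likewise be attached through Spark's standard `spark.jars` config; the local path below is illustrative:

```python
from pyspark.sql import SparkSession

# Same effect as passing --jars on the command line.
spark = SparkSession.builder \
    .config("spark.jars", "/path/to/spark-nlp_2.11-1.4.0.jar") \
    .getOrCreate()
```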

python/example/crf-ner/ner.ipynb

Lines changed: 6 additions & 8 deletions
@@ -8,7 +8,8 @@
 },
 "outputs": [],
 "source": [
-" import sys\n",
+"import os\n",
+"import sys\n",
 "sys.path.append('../../')\n",
 "\n",
 "from pyspark.sql import SparkSession\n",
@@ -91,9 +92,7 @@
 {
 "cell_type": "code",
 "execution_count": null,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
 "outputs": [],
 "source": [
 "\n",
@@ -110,7 +109,6 @@
 " .setOutputCol(\"token\")\n",
 "\n",
 "posTagger = PerceptronApproach()\\\n",
-" .setCorpusPath(\"anc-pos-corpus/\")\\\n",
 " .setIterations(5)\\\n",
 " .setInputCols([\"token\", \"document\"])\\\n",
 " .setOutputCol(\"pos\")\n",
@@ -123,8 +121,8 @@
 " .setMinEpochs(1)\\\n",
 " .setMaxEpochs(20)\\\n",
 " .setLossEps(1e-3)\\\n",
-" .setDicts([\"ner-corpus/dict.txt\"])\\\n",
-" .setDatasetPath(\"eng.train\")\\\n",
+" .setExternalFeatures(\"file://\" + os.getcwd() + \"/../../../src/main/resources/ner-corpus/dict.txt\")\\\n",
+" .setExternalDataset(\"file://\" + os.getcwd() + \"/eng.train\")\\\n",
 " .setL2(1)\\\n",
 " .setC0(1250000)\\\n",
 " .setRandomSeed(0)\\\n",
@@ -154,7 +152,7 @@
 "#Load the input data to be annotated\n",
 "data = spark. \\\n",
 " read. \\\n",
-" parquet(\"../../../src/test/resources/sentiment.parquet\"). \\\n",
+" parquet(\"file://\" + os.getcwd() + \"/../../../src/test/resources/sentiment.parquet\"). \\\n",
 " limit(1000)\n",
 "data.cache()\n",
 "data.count()\n",

python/example/entities-extractor/extractor.ipynb

Lines changed: 17 additions & 10 deletions
@@ -8,6 +8,7 @@
 },
 "outputs": [],
 "source": [
+"import os\n",
 "import sys\n",
 "sys.path.append('../../')\n",
 "\n",
@@ -40,9 +41,7 @@
 {
 "cell_type": "code",
 "execution_count": null,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
 "outputs": [],
 "source": [
 "import time\n",
@@ -60,13 +59,14 @@
 " .setOutputCol(\"token\")\n",
 "\n",
 "extractor = EntityExtractor()\\\n",
-" .setEntitiesPath(\"entities.txt\")\\\n",
+" .setEntities(\"file://\" + os.getcwd() + \"/entities.txt\")\\\n",
 " .setInputCols([\"token\", \"sentence\"])\\\n",
 " .setOutputCol(\"entites\")\n",
 "\n",
 "finisher = Finisher() \\\n",
 " .setInputCols([\"entites\"]) \\\n",
-" .setIncludeKeys(True)\n",
+" .setIncludeKeys(False) \\\n",
+" .setCleanAnnotations(True)\n",
 "\n",
 "pipeline = Pipeline(\n",
 " stages = [\n",
@@ -87,11 +87,11 @@
 "#Load the input data to be annotated\n",
 "data = spark. \\\n",
 " read. \\\n",
-" parquet(\"../../../src/test/resources/sentiment.parquet\"). \\\n",
+" parquet(\"file://\" + os.getcwd() + \"/../../../src/test/resources/sentiment.parquet\"). \\\n",
 " limit(1000)\n",
 "data.cache()\n",
 "data.count()\n",
-"data.show()"
+"data.show(20)"
 ]
 },
 {
@@ -120,9 +120,16 @@
 {
 "cell_type": "code",
 "execution_count": null,
-"metadata": {
-"collapsed": true
-},
+"metadata": {},
+"outputs": [],
+"source": [
+"extracted.select(\"finished_entites\")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
 "outputs": [],
 "source": [
 "pipeline.write().overwrite().save(\"./extractor_pipeline\")\n",
