Add text_key and label_key to TextClassificationJsonReader (#5005)

RyujiTamaki · epwalsh · web-flow · commit c5c9edf0977d · 2021-02-23T09:47:53.000-08:00
* Add text_key and label_key to TextClassificationJsonReader

* Update CHANGELOG.md

* Remove unnecessary test

* Apply suggestions from code review

Co-authored-by: Evan Pete Walsh <epwalsh10@gmail.com>

Co-authored-by: Evan Pete Walsh <epwalsh10@gmail.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -18,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added a way to specify extra parameters to the predictor in an `allennlp predict` call.
 - Added a way to initialize a `Vocabulary` from transformers models.
 - Added an example for fields of type `ListField[TextField]` to `apply_token_indexers` API docs.
+- Added `text_key` and `label_key` parameters to the `TextClassificationJsonReader` class.
 
 ### Fixed
 
diff --git a/allennlp/data/dataset_readers/text_classification_json.py b/allennlp/data/dataset_readers/text_classification_json.py
@@ -17,7 +17,6 @@
 class TextClassificationJsonReader(DatasetReader):
     """
     Reads tokens and their labels from a labeled text classification dataset.
-    Expects a "text" field and a "label" field in JSON format.
 
     The output of `read` is a list of `Instance` s with the fields:
         tokens : `TextField` and
@@ -44,6 +43,10 @@ class TextClassificationJsonReader(DatasetReader):
     skip_label_indexing : `bool`, optional (default = `False`)
         Whether or not to skip label indexing. You might want to skip label indexing if your
         labels are numbers, so the dataset reader doesn't re-number them starting from 0.
+    text_key : `str`, optional (default = `"text"`)
+        The key in each JSON object of the data file whose value is the input text.
+    label_key : `str`, optional (default = `"label"`)
+        The key in each JSON object of the data file whose value is the label, if present.
     """
 
     def __init__(
@@ -53,6 +56,8 @@ def __init__(
         segment_sentences: bool = False,
         max_sequence_length: int = None,
         skip_label_indexing: bool = False,
+        text_key: str = "text",
+        label_key: str = "label",
         **kwargs,
     ) -> None:
         super().__init__(
@@ -63,6 +68,8 @@ def __init__(
         self._max_sequence_length = max_sequence_length
         self._skip_label_indexing = skip_label_indexing
         self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
+        self._text_key = text_key
+        self._label_key = label_key
         if self._segment_sentences:
             self._sentence_segmenter = SpacySentenceSplitter()
 
@@ -73,8 +80,8 @@ def _read(self, file_path):
                 if not line:
                     continue
                 items = json.loads(line)
-                text = items["text"]
-                label = items.get("label")
+                text = items[self._text_key]
+                label = items.get(self._label_key)
                 if label is not None:
                     if self._skip_label_indexing:
                         try: