Commit 2ff1803

v0.1.0-patch1
1 parent cd5cb0a commit 2ff1803

80 files changed: +3147 −2278 lines

README.md

+11-11
@@ -141,7 +141,7 @@ OFASys enables multi-task multi-modal inference through the instruction alone. T
 <img src="https://ofasys.oss-cn-zhangjiakou.aliyuncs.com/data/coco/2014/val2014/COCO_val2014_000000222628.jpg" width="400">

 ```python
-instruction = '[IMAGE:img] <BOS> what does the image describe? <EOS> -> <BOS> [TEXT:cap] <EOS>'
+instruction = '[IMAGE:img] what does the image describe? -> [TEXT:cap]'
 data = {'img': "./COCO_val2014_000000222628.jpg"}
 output = model.inference(instruction, data=data)
 print(output.text)
@@ -152,7 +152,7 @@ print(output.text)
 <img src="https://www.2008php.com/2014_Website_appreciate/2015-06-22/20150622131649.jpg" width="400">

 ```python
-instruction = '[IMAGE:img] <BOS> which region does the text " [TEXT:cap] " describe? <EOS> -> [BOX:patch_boxes,add_bos,add_eos]'
+instruction = '[IMAGE:img] which region does the text " [TEXT:cap] " describe? -> [BOX:patch_boxes]'
 data = {'img': "https://www.2008php.com/2014_Website_appreciate/2015-06-22/20150622131649.jpg", "cap": "hand"}
 output = model.inference(instruction, data=data)
 output.save_box("output.jpg")
@@ -162,7 +162,7 @@ output.save_box("output.jpg")
 ### Text Summarization

 ```python
-instruction = '<BOS> what is the summary of article " [TEXT:src] "? <EOS> -> <BOS> [TEXT:tgt] <EOS>'
+instruction = 'what is the summary of article " [TEXT:src] "? -> [TEXT:tgt]'
 data = {'src': "poland 's main opposition party tuesday endorsed president lech walesa in an upcoming "
                "presidential run-off election after a reformed communist won the first round of voting ."}
 output = model.inference(instruction, data=data)
@@ -173,7 +173,7 @@ print(output.text)
 ### Table-to-Text Generation

 ```python
-instruction = '<BOS> structured knowledge: " [STRUCT:database,uncased] " . how to describe the tripleset ? <EOS> -> <BOS> [TEXT:tgt] <EOS>'
+instruction = 'structured knowledge: " [STRUCT:database,uncased] " . how to describe the tripleset ? -> [TEXT:tgt]'
 data = {
     'database': [['Atlanta', 'OFFICIAL_POPULATION', '5,457,831'],
                  ['[TABLECONTEXT]', 'METROPOLITAN_AREA', 'Atlanta'],
@@ -184,13 +184,13 @@ data = {
 }
 output = model.inference(instruction, data=data, beam_size=1)
 print(output.text)
-# "atlanta is the metropolitan area in the united states in 2012."
+# "atlanta, united states has a population of 5,457,831 in 2012."
 ```

 ### Text-to-SQL Generation

 ```python
-instruction = '<BOS> " [TEXT:src] " ; structured knowledge: " [STRUCT:database,max_length=876] " . generating sql code. <EOS> -> <BOS> [TEXT:tgt] <EOS>'
+instruction = ' " [TEXT:src] " ; structured knowledge: " [STRUCT:database,max_length=876] " . generating sql code. -> [TEXT:tgt]'
 database = [
     ['concert_singer'],
     ['stadium', 'stadium_id , location , name , capacity , highest , lowest , average'],
@@ -201,21 +201,21 @@ database = [
 data = [
     {'src': 'What are the names, countries, and ages for every singer in descending order of age?', 'database': database},
     {'src': 'What are all distinct countries where singers above age 20 are from?', 'database': database},
-    {'src': 'What are the locations and names of all stations with capacity between 5000 and 10000?', 'database': database}
+    {'src': 'Show the name and the release year of the song by the youngest singer.', 'database': database}
 ]
 output = model.inference(instruction, data=data)
-print('\n'.join([o.text for o in output]))
+print('\n'.join(o.text for o in output))
 # "select name, country, age from singer order by age desc"
 # "select distinct country from singer where age > 20"
-# "select location, name from stadium where capacity between 5000 and 10000"
+# "select song_name, song_release_year from singer order by age limit 1"
 ```

 ### Video Captioning

 <img src="https://ofasys.oss-cn-zhangjiakou.aliyuncs.com/examples/video.png" width="400">

 ```python
-instruction = '[VIDEO:video] <BOS> what does the video describe? <EOS> -> <BOS> [TEXT:cap] <EOS>'
+instruction = '[VIDEO:video] what does the video describe? -> [TEXT:cap]'
 data = {'video': './video7021.mp4'}
 output = model.inference(instruction, data=data)
 print(output.text)
@@ -230,7 +230,7 @@ print(output.text)
 </audio>

 ```python
-instruction = '[AUDIO:wav] <BOS> what is the text corresponding to the voice? <EOS> -> [TEXT:text,preprocess=text_phone,add_bos,add_eos]'
+instruction = '[AUDIO:wav] what is the text corresponding to the voice? -> [TEXT:text,preprocess=text_phone]'
 data = {'wav': './1272-128104-0001.flac'}
 output = model.inference(instruction, data=data)
 print(output.text)
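The README snippets above all call `model.inference(...)` on a multi-task model that is constructed earlier in the README, outside the hunks shown here. A minimal sketch of that setup, with the checkpoint path left as a placeholder rather than a real URL:

```python
from ofasys import OFASys

# Placeholder checkpoint path; the README gives the actual multi-task checkpoint URL.
model = OFASys.from_pretrained('<multitask_checkpoint>.pt')

# Any instruction from the examples above can then be run through the same model.
instruction = '[IMAGE:img] what does the image describe? -> [TEXT:cap]'
output = model.inference(instruction, data={'img': './COCO_val2014_000000222628.jpg'})
print(output.text)
```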

docs/app.py

+21
@@ -0,0 +1,21 @@
+import logging
+
+from flask import Flask
+from flask_cors import CORS
+
+# Serve the built Sphinx HTML (build/html/) as static files, with CORS enabled.
+app = Flask(__name__, static_url_path='/', static_folder='build/html/')
+CORS(app, resources=r'/*', supports_credentials=True)
+
+
+@app.route('/')
+@app.route('/<path:path>')
+def serve_sphinx_docs(path='index.html'):
+    return app.send_static_file(path)
+
+
+if __name__ != "__main__":
+    # Running under gunicorn: reuse gunicorn's log handlers and level.
+    gunicorn_logger = logging.getLogger("gunicorn.error")
+    app.logger.handlers = gunicorn_logger.handlers
+    app.logger.setLevel(gunicorn_logger.level)
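A quick way to sanity-check this docs server is Flask's built-in test client. A minimal sketch, assuming the Sphinx HTML has already been built into `build/html/` and the script is run from the `docs/` directory (the second page path is hypothetical):

```python
from app import app  # the module added above

client = app.test_client()
print(client.get('/').status_code)                        # serves build/html/index.html
print(client.get('/start/quickstart.html').status_code)   # hypothetical page path
```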

docs/requirements_doc.txt

+2
@@ -2,6 +2,8 @@ sphinx
 furo
 sphinx_rtd_theme
 sphinx-argparse
+flask
+flask_cors
 sphinxcontrib-video
 furo
 sphinx-copybutton

docs/source/api/trainer.rst

+3
@@ -0,0 +1,3 @@
+============
+trainer(WIP)
+============

docs/source/howto/add_module.rst

+2-2
@@ -193,13 +193,13 @@ The original content of ``caption.yaml`` is:

 task_name: caption
 instruction:
-  - '[IMAGE:image] <BOS> what does the image describe? <EOS> -> <BOS> [TEXT:caption] <EOS>'
+  - '[IMAGE:image] what does the image describe? -> [TEXT:caption]'

 We can change to use ViT as the image adaptor instead of ResNet by simply modifying the instruction.

 .. code:: yaml

 task_name: caption
 instruction:
-  - '[IMAGE:image,adaptor=image_vit] <BOS> what does the image describe? <EOS> -> <BOS> [TEXT:caption] <EOS>'
+  - '[IMAGE:image,adaptor=image_vit] what does the image describe? -> [TEXT:caption]'

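The same `adaptor=` slot attribute can also be used in an inline instruction string at inference time; a hedged sketch (the loaded checkpoint must actually contain weights for the `image_vit` adaptor):

```python
# Hypothetical inline use of the adaptor= attribute shown in the YAML above.
instruction = '[IMAGE:image,adaptor=image_vit] what does the image describe? -> [TEXT:caption]'
output = model.inference(instruction, data={'image': './COCO_val2014_000000222628.jpg'})
print(output.text)
```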

docs/source/howto/train.rst

+1-1
@@ -23,7 +23,7 @@ in the scripts directory and fill in the following configuration:
 task:
   image_classify:
     instruction:
-      template: '[IMAGE:image,preprocess=imagenet] <BOS> what does the image describe? <EOS> -> <BOS> [TEXT:label_name,closed_set] <EOS>'
+      template: '[IMAGE:image,preprocess=imagenet] what does the image describe? -> [TEXT:label_name,closed_set]'

     dataset:
       num_workers: 8

docs/source/start/quickstart.rst

+8-8
@@ -79,7 +79,7 @@ Image Captioning

 .. code:: python

->>> instruction = '[IMAGE:img] <BOS> what does the image describe? <EOS> -> <BOS> [TEXT:cap] <EOS>'
+>>> instruction = '[IMAGE:img] what does the image describe? -> [TEXT:cap]'
 >>> data = {'img': "https://ofasys.oss-cn-zhangjiakou.aliyuncs.com/data/coco/2014/val2014/COCO_val2014_000000222628.jpg"}
 >>> output = model.inference(instruction, data=data)
 >>> print(output.text)
@@ -95,7 +95,7 @@ Visual Grounding

 .. code:: python

->>> instruction = '[IMAGE:img] <BOS> which region does the text " [TEXT:cap] " describe? <EOS> -> [BOX:patch_boxes,add_bos,add_eos]'
+>>> instruction = '[IMAGE:img] which region does the text " [TEXT:cap] " describe? -> [BOX:patch_boxes]'
 >>> data = [
 ... {'img': "https://www.2008php.com/2014_Website_appreciate/2015-06-22/20150622131649.jpg", 'cap': 'hand'},
 ... {'img': "http://ofasys.oss-cn-zhangjiakou.aliyuncs.com/data/coco/2014/train2014/COCO_train2014_000000581563.jpg", 'cap': 'taxi'},
@@ -112,7 +112,7 @@ Text Summarization

 .. code:: python

->>> instruction = '<BOS> what is the summary of article " [TEXT:src] "? <EOS> -> <BOS> [TEXT:tgt] <EOS>'
+>>> instruction = 'what is the summary of article " [TEXT:src] "? -> [TEXT:tgt]'
 >>> data = {'src': "poland 's main opposition party tuesday endorsed president lech walesa in an upcoming "
 ... "presidential run-off election after a reformed communist won the first round of voting ."}
 >>> output = model.inference(instruction, data=data)
@@ -138,7 +138,7 @@ Table-to-Text Generation

 .. code:: python

->>> instruction = '<BOS> structured knowledge: " [STRUCT:database,uncased] " . how to describe the tripleset ? <EOS> -> <BOS> [TEXT:tgt] <EOS>'
+>>> instruction = 'structured knowledge: " [STRUCT:database,uncased] " . how to describe the tripleset ? -> [TEXT:tgt]'
 >>> data = {
 ... 'database': [['Atlanta', 'OFFICIAL_POPULATION', '5,457,831'],
 ... ['[TABLECONTEXT]', 'METROPOLITAN_AREA', 'Atlanta'],
@@ -152,7 +152,7 @@ Table-to-Text Generation

 ::

-    "atlanta is the metropolitan area in the united states in 2012."
+    "atlanta, united states has a population of 5,457,831 in 2012."

 Text-to-SQL Generation
 ---------------------------
@@ -172,7 +172,7 @@ Text-to-SQL Generation

 .. code:: python

->>> instruction = '<BOS> " [TEXT:src] " ; structured knowledge: " [STRUCT:database,max_length=876] " . generating sql code. <EOS> -> <BOS> [TEXT:tgt] <EOS>'
+>>> instruction = '" [TEXT:src] " ; structured knowledge: " [STRUCT:database,max_length=876] " . generating sql code. -> [TEXT:tgt]'
 >>> database = [
 ... ['concert_singer'],
 ... ['stadium', 'stadium_id , location , name , capacity , highest , lowest , average'],
@@ -199,7 +199,7 @@ Video Captioning

 .. code:: python

->>> instruction = '[VIDEO:video] <BOS> what does the video describe? <EOS> -> <BOS> [TEXT:cap] <EOS>'
+>>> instruction = '[VIDEO:video] what does the video describe? -> [TEXT:cap]'
 >>> data = {'video': 'oss://ofasys/datasets/msrvtt_data/videos/video7021.mp4'}
 >>> output = model.inference(instruction, data=data)
 >>> print(output.text)
@@ -215,7 +215,7 @@ Speech-to-Text Generation

 .. code:: python

->>> instruction = '[AUDIO:wav] <BOS> what is the text corresponding to the voice? <EOS> -> [TEXT:text,preprocess=text_phone,add_bos,add_eos]'
+>>> instruction = '[AUDIO:wav] what is the text corresponding to the voice? -> [TEXT:text,preprocess=text_phone]'
 >>> data = {'wav': 'oss://ofasys/data/librispeech/dev-clean/1272/128104/1272-128104-0001.flac'}
 >>> output = model.inference(instruction, data=data)
 >>> print(output.text)

docs/source/task/audio.rst

+2-2
@@ -15,14 +15,14 @@ Default Template
 ^^^^^^^^^^^^^^^^
 .. code-block:: console

-[AUDIO:wav] <BOS> what is the text corresponding to the voice? <EOS> -> [TEXT:text,preprocess=text_phone,add_bos,add_eos]
+[AUDIO:wav] what is the text corresponding to the voice? -> [TEXT:text,preprocess=text_phone]

 Usage
 ^^^^^^^^^^^^^^^^^^^^

 .. code:: python

->>> instruction = '[AUDIO:wav] <BOS> what is the text corresponding to the voice? <EOS> -> [TEXT:text,preprocess=text_phone,add_bos,add_eos]'
+>>> instruction = '[AUDIO:wav] what is the text corresponding to the voice? -> [TEXT:text,preprocess=text_phone]'
 >>> data = {'wav': 'oss://ofasys/data/librispeech/dev-clean/1272/128104/1272-128104-0001.flac'}
 >>> output = model.inference(instruction, data=data)
 >>> print(output.text)

docs/source/task/box.rst

+4-4
@@ -21,14 +21,14 @@ Default Template
 ^^^^^^^^^^^^^^^^
 .. code-block:: console

-[IMAGE:img] which region does the text "[TEXT:cap]" describe? -> [BOX:patch_boxes,add_bos,add_eos]
+[IMAGE:img] which region does the text "[TEXT:cap]" describe? -> [BOX:patch_boxes]

 Usage
 ^^^^^^^^^^^^^^^^^^^^

 .. code:: python

->>> instruction = '[IMAGE:img] <BOS> which region does the text " [TEXT:cap] " describe? <EOS> -> [BOX:patch_boxes,add_bos,add_eos]'
+>>> instruction = '[IMAGE:img] which region does the text " [TEXT:cap] " describe? -> [BOX:patch_boxes]'
 >>> data = {'img': "https://www.2008php.com/2014_Website_appreciate/2015-06-22/20150622131649.jpg", 'cap': 'hand'}
 >>> output = model.inference(instruction, data=data)
 >>> output.save_box('0.jpg')
@@ -61,7 +61,7 @@ Default Template
 ^^^^^^^^^^^^^^^^
 .. code-block:: console

-[IMAGE:img] <BOS> what does the region describe? region: [BOX:patch_boxes] <EOS> -> <BOS> [TEXT:cap] <EOS>
+[IMAGE:img] what does the region describe? region: [BOX:patch_boxes] -> [TEXT:cap]


 .. _od:
@@ -79,5 +79,5 @@ Default Template
 ^^^^^^^^^^^^^^^^^^
 .. code-block:: console

-[IMAGE:img] <BOS> what are the objects in the image? <EOS> -> <BOS>( [BOX] [TEXT])* <EOS>
+[IMAGE:img] what are the objects in the image? -> ( [BOX] [TEXT])*


docs/source/task/image.rst

+3-3
@@ -17,15 +17,15 @@ Default Template

 .. code-block:: console

-[IMAGE:img] <BOS> what does the image describe? <EOS> -> <BOS> [TEXT:cap] <EOS>
+[IMAGE:img] what does the image describe? -> [TEXT:cap]


 Usage
 ^^^^^^^^^^^^^^^^^^^^

 .. code-block::

->>> template = '[IMAGE:img] <BOS> what does the image describe? <EOS> -> <BOS> [TEXT:cap] <EOS>'
+>>> template = '[IMAGE:img] what does the image describe? -> [TEXT:cap]'
 >>> data = {'img': "https://www.2008php.com/2014_Website_appreciate/2015-06-22/20150622131649.jpg"}
 >>> output = model.inference(template, data=data)
 >>> print(output)
@@ -108,7 +108,7 @@ Default Template
 ^^^^^^^^^^^^^^^^
 .. code-block:: console

-[IMAGE:image] <BOS> [TEXT:question] <EOS> -> <BOS> [TEXT:answer,closed_set] <EOS>
+[IMAGE:image] [TEXT:question] -> [TEXT:answer,closed_set]


 .. _snlive:

docs/source/task/motion.rst

+62-3
@@ -14,12 +14,71 @@ We now describe how OFASys implements denoising diffusion probabilistic modeling

 Here we assume the dataset is a table, where each sample record contains two fields.
 Field "mocap" of modality "MOTION" is a BVH file containing motion capture data,
-while field "title" of modality "TEXT" is a text sentence describing the captured motion, e.g., "a person walks four steps backward".
-Similarly, we can replace "title" with other modalities such as "[AUDIO:...]", "[IMAGE:...]", "[VIDEO:...]" to implement various kinds of conditional synthesis tasks.
+while field "text" of modality "TEXT" is a text sentence describing the captured motion, e.g., "a person walks four steps backward".
+Similarly, we can replace "text" with other modalities such as "[AUDIO:...]", "[IMAGE:...]", "[VIDEO:...]" to implement various kinds of conditional synthesis tasks.
 And we can simply replace the text with an empty string to implement the task of unconditional motion synthesis, a.k.a. motion prediction.

 Default Template
 ^^^^^^^^^^^^^^^^
 .. code-block:: console

-[TEXT:title] -> [MOTION:mocap,preprocess=motion_6d,adaptor=motion_6d]
+motion capture: [TEXT:text] -> [MOTION:bvh_frames,preprocess=motion_6d,adaptor=motion_6d]
+
+Usage
+^^^^^^^^^^^^^^^^^^^^
+
+Prepare the model, instruction, and text prompts for text-to-motion generation.
+
+.. code:: python
+
+import torch
+from ofasys import OFASys
+# This checkpoint is for demonstration purposes only, and does not represent the final quality of any project or product.
+# The checkpoint is for research only, and commercial use is prohibited.
+model = OFASys.from_pretrained('http://ofasys.oss-cn-zhangjiakou.aliyuncs.com/model_hub/single_task_motion.pt')
+if torch.cuda.is_available():
+    model = model.cuda()
+instruction = 'motion capture: [TEXT:text] -> [MOTION:bvh_frames,preprocess=motion_6d,adaptor=motion_6d]'
+prompts = [
+    {'text': 'run then jump'},
+    {'text': 'run then jump like dancing'},
+]
+
+Example 1: Inference without classifier-free guidance. This usage is much simpler and more concise than the classifier-free guided approach described later (see Example 2). However, the generated results tend to correlate poorly with the text prompts. It is thus *NOT* recommended.
+
+.. code:: python
+
+output = model.inference(instruction, data=prompts)
+output[0].save_as_gif('run_then_jump__no_guide.gif')
+
+
+Example 2 (*recommended*): Inference with `classifier-free guidance <https://arxiv.org/abs/2207.12598>`_ enabled.
+It uses an experimental API for negative prompting and classifier-free guidance.
+Classifier-free guidance is implemented by providing the NULL condition (i.e., an empty text) as the negative prompt.
+
+.. code:: python
+
+guided_prompts = []
+for p in prompts:
+    guided_prompts.append(p)
+    guided_prompts.append({'text': ''})  # The negative prompt: an empty string for classifier-free guidance.
+# This API requires the positive and negative prompts to be in the same batch, so assert batch_size % 2 == 0.
+output = model.inference(instruction, data=guided_prompts, guidance_weight=3.0, batch_size=2)
+output = output[::2]
+output[0].save_as_gif('run_then_jump__guided.gif')
+output[1].save_as_gif('run_then_jump_like_dancing__guided.gif')
+output[0].save_as_bvh('run_then_jump__guided.bvh')  # Export the result in the BVH format for Blender.
+
+CASE
+^^^^^^^^^^^^^^^^^^
+The saved result "run_then_jump__guided.gif" should look like the one below:
+
+.. image:: https://ofasys.oss-cn-zhangjiakou.aliyuncs.com/examples/run_then_jump_guided.gif
+
+The saved result "run_then_jump_like_dancing__guided.gif" should look like the one below:
+
+.. image:: https://ofasys.oss-cn-zhangjiakou.aliyuncs.com/examples/run_then_jump_like_dancing_guided.gif
+
+The saved result in the `BVH <https://research.cs.wisc.edu/graphics/Courses/cs-838-1999/Jeff/BVH.html>`_ file format, "run_then_jump__guided.bvh", can be imported into 3D animation software such as `Blender <https://www.blender.org/>`_ for rendering:
+
+.. video:: http://ofasys.oss-cn-zhangjiakou.aliyuncs.com/examples/run_then_jump_guided.mp4
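For readers new to classifier-free guidance: the interleaved positive/empty prompts above let the sampler form both a conditional and an unconditional prediction at each denoising step and blend them. A conceptual sketch of that blend (not the OFASys internals; the exact convention behind `guidance_weight` may differ):

```python
def guided_prediction(pred_cond, pred_uncond, w=3.0):
    # w = 0 ignores the text prompt; larger w pushes samples harder
    # toward the prompt, typically at some cost in diversity.
    return pred_uncond + w * (pred_cond - pred_uncond)
```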

docs/source/task/structural.rst

+1-1
@@ -65,7 +65,7 @@ Usage

 .. code:: python

->>> instruction = '<BOS> structured knowledge: " [TEXT:database] " . how to describe the tripleset ? <EOS> -> <BOS> [TEXT:tgt] '
+>>> instruction = 'structured knowledge: " [TEXT:database] " . how to describe the tripleset ? -> [TEXT:tgt] '
 >>> data = {'database': database}
 >>> output = model.inference(instruction, data=data)
 >>> print(output.text)
