Commit 2ff1803

v0.1.0-patch1
1 parent cd5cb0a commit 2ff1803

80 files changed: +3147 −2278 lines

README.md

+11-11
@@ -141,7 +141,7 @@ OFASys enables multi-task multi-modal inference through the instruction alone. T
 <img src="https://ofasys.oss-cn-zhangjiakou.aliyuncs.com/data/coco/2014/val2014/COCO_val2014_000000222628.jpg" width="400">

 ```python
-instruction = '[IMAGE:img] <BOS> what does the image describe? <EOS> -> <BOS> [TEXT:cap] <EOS>'
+instruction = '[IMAGE:img] what does the image describe? -> [TEXT:cap]'
 data = {'img': "./COCO_val2014_000000222628.jpg"}
 output = model.inference(instruction, data=data)
 print(output.text)
@@ -152,7 +152,7 @@ print(output.text)
 <img src="https://www.2008php.com/2014_Website_appreciate/2015-06-22/20150622131649.jpg" width="400">

 ```python
-instruction = '[IMAGE:img] <BOS> which region does the text " [TEXT:cap] " describe? <EOS> -> [BOX:patch_boxes,add_bos,add_eos]'
+instruction = '[IMAGE:img] which region does the text " [TEXT:cap] " describe? -> [BOX:patch_boxes]'
 data = {'img': "https://www.2008php.com/2014_Website_appreciate/2015-06-22/20150622131649.jpg", "cap": "hand"}
 output = model.inference(instruction, data=data)
 output.save_box("output.jpg")
@@ -162,7 +162,7 @@ output.save_box("output.jpg")
 ### Text Summarization

 ```python
-instruction = '<BOS> what is the summary of article " [TEXT:src] "? <EOS> -> <BOS> [TEXT:tgt] <EOS>'
+instruction = 'what is the summary of article " [TEXT:src] "? -> [TEXT:tgt]'
 data = {'src': "poland 's main opposition party tuesday endorsed president lech walesa in an upcoming "
                "presidential run-off election after a reformed communist won the first round of voting ."}
 output = model.inference(instruction, data=data)
@@ -173,7 +173,7 @@ print(output.text)
 ### Table-to-Text Generation

 ```python
-instruction = '<BOS> structured knowledge: " [STRUCT:database,uncased] " . how to describe the tripleset ? <EOS> -> <BOS> [TEXT:tgt] <EOS>'
+instruction = 'structured knowledge: " [STRUCT:database,uncased] " . how to describe the tripleset ? -> [TEXT:tgt]'
 data = {
     'database': [['Atlanta', 'OFFICIAL_POPULATION', '5,457,831'],
                  ['[TABLECONTEXT]', 'METROPOLITAN_AREA', 'Atlanta'],
@@ -184,13 +184,13 @@ data = {
 }
 output = model.inference(instruction, data=data, beam_size=1)
 print(output.text)
-# "atlanta is the metropolitan area in the united states in 2012."
+# "atlanta, united states has a population of 5,457,831 in 2012."
 ```

 ### Text-to-SQL Generation

 ```python
-instruction = '<BOS> " [TEXT:src] " ; structured knowledge: " [STRUCT:database,max_length=876] " . generating sql code. <EOS> -> <BOS> [TEXT:tgt] <EOS>'
+instruction = ' " [TEXT:src] " ; structured knowledge: " [STRUCT:database,max_length=876] " . generating sql code. -> [TEXT:tgt]'
 database = [
     ['concert_singer'],
     ['stadium', 'stadium_id , location , name , capacity , highest , lowest , average'],
@@ -201,21 +201,21 @@ database = [
 data = [
     {'src': 'What are the names, countries, and ages for every singer in descending order of age?', 'database': database},
     {'src': 'What are all distinct countries where singers above age 20 are from?', 'database': database},
-    {'src': 'What are the locations and names of all stations with capacity between 5000 and 10000?', 'database': database}
+    {'src': 'Show the name and the release year of the song by the youngest singer.', 'database': database}
 ]
 output = model.inference(instruction, data=data)
-print('\n'.join([o.text for o in output]))
+print('\n'.join(o.text for o in output))
 # "select name, country, age from singer order by age desc"
 # "select distinct country from singer where age > 20"
-# "select location, name from stadium where capacity between 5000 and 10000"
+# "select song_name, song_release_year from singer order by age limit 1"
 ```

 ### Video Captioning

 <img src="https://ofasys.oss-cn-zhangjiakou.aliyuncs.com/examples/video.png" width="400">

 ```python
-instruction = '[VIDEO:video] <BOS> what does the video describe? <EOS> -> <BOS> [TEXT:cap] <EOS>'
+instruction = '[VIDEO:video] what does the video describe? -> [TEXT:cap]'
 data = {'video': './video7021.mp4'}
 output = model.inference(instruction, data=data)
 print(output.text)
@@ -230,7 +230,7 @@ print(output.text)
 </audio>

 ```python
-instruction = '[AUDIO:wav] <BOS> what is the text corresponding to the voice? <EOS> -> [TEXT:text,preprocess=text_phone,add_bos,add_eos]'
+instruction = '[AUDIO:wav] what is the text corresponding to the voice? -> [TEXT:text,preprocess=text_phone]'
 data = {'wav': './1272-128104-0001.flac'}
 output = model.inference(instruction, data=data)
 print(output.text)
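The README snippets above all call `model.inference(...)` on a multi-task model that is constructed earlier in the README, outside the hunks shown here. A minimal sketch of that setup, with the checkpoint path left as a placeholder rather than a real URL:

```python
from ofasys import OFASys

# Placeholder checkpoint path; the README gives the actual multi-task checkpoint URL.
model = OFASys.from_pretrained('<multitask_checkpoint>.pt')

# Any instruction from the examples above can then be run through the same model.
instruction = '[IMAGE:img] what does the image describe? -> [TEXT:cap]'
output = model.inference(instruction, data={'img': './COCO_val2014_000000222628.jpg'})
print(output.text)
```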

docs/app.py

+21
@@ -0,0 +1,21 @@
+import logging
+
+from flask import Flask
+from flask_cors import CORS
+
+# Serve the built Sphinx HTML (build/html/) as static files, with CORS enabled.
+app = Flask(__name__, static_url_path='/', static_folder='build/html/')
+CORS(app, resources=r'/*', supports_credentials=True)
+
+
+@app.route('/')
+@app.route('/<path:path>')
+def serve_sphinx_docs(path='index.html'):
+    return app.send_static_file(path)
+
+
+if __name__ != "__main__":
+    # Running under gunicorn: reuse gunicorn's log handlers and level.
+    gunicorn_logger = logging.getLogger("gunicorn.error")
+    app.logger.handlers = gunicorn_logger.handlers
+    app.logger.setLevel(gunicorn_logger.level)
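A quick way to sanity-check this docs server is Flask's built-in test client. A minimal sketch, assuming the Sphinx HTML has already been built into `build/html/` and the script is run from the `docs/` directory (the second page path is hypothetical):

```python
from app import app  # the module added above

client = app.test_client()
print(client.get('/').status_code)                        # serves build/html/index.html
print(client.get('/start/quickstart.html').status_code)   # hypothetical page path
```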

docs/requirements_doc.txt

+2
@@ -2,6 +2,8 @@ sphinx
 furo
 sphinx_rtd_theme
 sphinx-argparse
+flask
+flask_cors
 sphinxcontrib-video
 furo
 sphinx-copybutton

docs/source/api/trainer.rst

+3
@@ -0,0 +1,3 @@
+============
+trainer(WIP)
+============

docs/source/howto/add_module.rst

+2-2
@@ -193,13 +193,13 @@ The original content of ``caption.yaml`` is:

 task_name: caption
 instruction:
-  - '[IMAGE:image] <BOS> what does the image describe? <EOS> -> <BOS> [TEXT:caption] <EOS>'
+  - '[IMAGE:image] what does the image describe? -> [TEXT:caption]'

 We can change to use ViT as the image adaptor instead of ResNet by simply modifying the instruction.

 .. code:: yaml

 task_name: caption
 instruction:
-  - '[IMAGE:image,adaptor=image_vit] <BOS> what does the image describe? <EOS> -> <BOS> [TEXT:caption] <EOS>'
+  - '[IMAGE:image,adaptor=image_vit] what does the image describe? -> [TEXT:caption]'

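The same `adaptor=` slot attribute can also be used in an inline instruction string at inference time; a hedged sketch (the loaded checkpoint must actually contain weights for the `image_vit` adaptor):

```python
# Hypothetical inline use of the adaptor= attribute shown in the YAML above.
instruction = '[IMAGE:image,adaptor=image_vit] what does the image describe? -> [TEXT:caption]'
output = model.inference(instruction, data={'image': './COCO_val2014_000000222628.jpg'})
print(output.text)
```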

docs/source/howto/train.rst

+1-1
@@ -23,7 +23,7 @@ in the scripts directory and fill in the following configuration:
 task:
   image_classify:
     instruction:
-      template: '[IMAGE:image,preprocess=imagenet] <BOS> what does the image describe? <EOS> -> <BOS> [TEXT:label_name,closed_set] <EOS>'
+      template: '[IMAGE:image,preprocess=imagenet] what does the image describe? -> [TEXT:label_name,closed_set]'

     dataset:
       num_workers: 8

docs/source/start/quickstart.rst

+8-8
@@ -79,7 +79,7 @@ Image Captioning

 .. code:: python

->>> instruction = '[IMAGE:img] <BOS> what does the image describe? <EOS> -> <BOS> [TEXT:cap] <EOS>'
+>>> instruction = '[IMAGE:img] what does the image describe? -> [TEXT:cap]'
 >>> data = {'img': "https://ofasys.oss-cn-zhangjiakou.aliyuncs.com/data/coco/2014/val2014/COCO_val2014_000000222628.jpg"}
 >>> output = model.inference(instruction, data=data)
 >>> print(output.text)
@@ -95,7 +95,7 @@ Visual Grounding

 .. code:: python

->>> instruction = '[IMAGE:img] <BOS> which region does the text " [TEXT:cap] " describe? <EOS> -> [BOX:patch_boxes,add_bos,add_eos]'
+>>> instruction = '[IMAGE:img] which region does the text " [TEXT:cap] " describe? -> [BOX:patch_boxes]'
 >>> data = [
 ... {'img': "https://www.2008php.com/2014_Website_appreciate/2015-06-22/20150622131649.jpg", 'cap': 'hand'},
 ... {'img': "http://ofasys.oss-cn-zhangjiakou.aliyuncs.com/data/coco/2014/train2014/COCO_train2014_000000581563.jpg", 'cap': 'taxi'},
@@ -112,7 +112,7 @@ Text Summarization

 .. code:: python

->>> instruction = '<BOS> what is the summary of article " [TEXT:src] "? <EOS> -> <BOS> [TEXT:tgt] <EOS>'
+>>> instruction = 'what is the summary of article " [TEXT:src] "? -> [TEXT:tgt]'
 >>> data = {'src': "poland 's main opposition party tuesday endorsed president lech walesa in an upcoming "
 ... "presidential run-off election after a reformed communist won the first round of voting ."}
 >>> output = model.inference(instruction, data=data)
@@ -138,7 +138,7 @@ Table-to-Text Generation

 .. code:: python

->>> instruction = '<BOS> structured knowledge: " [STRUCT:database,uncased] " . how to describe the tripleset ? <EOS> -> <BOS> [TEXT:tgt] <EOS>'
+>>> instruction = 'structured knowledge: " [STRUCT:database,uncased] " . how to describe the tripleset ? -> [TEXT:tgt]'
 >>> data = {
 ... 'database': [['Atlanta', 'OFFICIAL_POPULATION', '5,457,831'],
 ... ['[TABLECONTEXT]', 'METROPOLITAN_AREA', 'Atlanta'],
@@ -152,7 +152,7 @@ Table-to-Text Generation

 ::

-    "atlanta is the metropolitan area in the united states in 2012."
+    "atlanta, united states has a population of 5,457,831 in 2012."

 Text-to-SQL Generation
 ---------------------------
@@ -172,7 +172,7 @@ Text-to-SQL Generation

 .. code:: python

->>> instruction = '<BOS> " [TEXT:src] " ; structured knowledge: " [STRUCT:database,max_length=876] " . generating sql code. <EOS> -> <BOS> [TEXT:tgt] <EOS>'
+>>> instruction = '" [TEXT:src] " ; structured knowledge: " [STRUCT:database,max_length=876] " . generating sql code. -> [TEXT:tgt]'
 >>> database = [
 ... ['concert_singer'],
 ... ['stadium', 'stadium_id , location , name , capacity , highest , lowest , average'],
@@ -199,7 +199,7 @@ Video Captioning

 .. code:: python

->>> instruction = '[VIDEO:video] <BOS> what does the video describe? <EOS> -> <BOS> [TEXT:cap] <EOS>'
+>>> instruction = '[VIDEO:video] what does the video describe? -> [TEXT:cap]'
 >>> data = {'video': 'oss://ofasys/datasets/msrvtt_data/videos/video7021.mp4'}
 >>> output = model.inference(instruction, data=data)
 >>> print(output.text)
@@ -215,7 +215,7 @@ Speech-to-Text Generation

 .. code:: python

->>> instruction = '[AUDIO:wav] <BOS> what is the text corresponding to the voice? <EOS> -> [TEXT:text,preprocess=text_phone,add_bos,add_eos]'
+>>> instruction = '[AUDIO:wav] what is the text corresponding to the voice? -> [TEXT:text,preprocess=text_phone]'
 >>> data = {'wav': 'oss://ofasys/data/librispeech/dev-clean/1272/128104/1272-128104-0001.flac'}
 >>> output = model.inference(instruction, data=data)
 >>> print(output.text)

docs/source/task/audio.rst

+2-2
@@ -15,14 +15,14 @@ Default Template
 ^^^^^^^^^^^^^^^^
 .. code-block:: console

-[AUDIO:wav] <BOS> what is the text corresponding to the voice? <EOS> -> [TEXT:text,preprocess=text_phone,add_bos,add_eos]
+[AUDIO:wav] what is the text corresponding to the voice? -> [TEXT:text,preprocess=text_phone]

 Usage
 ^^^^^^^^^^^^^^^^^^^^

 .. code:: python

->>> instruction = '[AUDIO:wav] <BOS> what is the text corresponding to the voice? <EOS> -> [TEXT:text,preprocess=text_phone,add_bos,add_eos]'
+>>> instruction = '[AUDIO:wav] what is the text corresponding to the voice? -> [TEXT:text,preprocess=text_phone]'
 >>> data = {'wav': 'oss://ofasys/data/librispeech/dev-clean/1272/128104/1272-128104-0001.flac'}
 >>> output = model.inference(instruction, data=data)
 >>> print(output.text)

docs/source/task/box.rst

+4-4
@@ -21,14 +21,14 @@ Default Template
 ^^^^^^^^^^^^^^^^
 .. code-block:: console

-[IMAGE:img] which region does the text "[TEXT:cap]" describe? -> [BOX:patch_boxes,add_bos,add_eos]
+[IMAGE:img] which region does the text "[TEXT:cap]" describe? -> [BOX:patch_boxes]

 Usage
 ^^^^^^^^^^^^^^^^^^^^

 .. code:: python

->>> instruction = '[IMAGE:img] <BOS> which region does the text " [TEXT:cap] " describe? <EOS> -> [BOX:patch_boxes,add_bos,add_eos]'
+>>> instruction = '[IMAGE:img] which region does the text " [TEXT:cap] " describe? -> [BOX:patch_boxes]'
 >>> data = {'img': "https://www.2008php.com/2014_Website_appreciate/2015-06-22/20150622131649.jpg", 'cap': 'hand'}
 >>> output = model.inference(instruction, data=data)
 >>> output.save_box('0.jpg')
@@ -61,7 +61,7 @@ Default Template
 ^^^^^^^^^^^^^^^^
 .. code-block:: console

-[IMAGE:img] <BOS> what does the region describe? region: [BOX:patch_boxes] <EOS> -> <BOS> [TEXT:cap] <EOS>
+[IMAGE:img] what does the region describe? region: [BOX:patch_boxes] -> [TEXT:cap]


 .. _od:
@@ -79,5 +79,5 @@ Default Template
 ^^^^^^^^^^^^^^^^^^
 .. code-block:: console

-[IMAGE:img] <BOS> what are the objects in the image? <EOS> -> <BOS>( [BOX] [TEXT])* <EOS>
+[IMAGE:img] what are the objects in the image? -> ( [BOX] [TEXT])*


docs/source/task/image.rst

+3-3
@@ -17,15 +17,15 @@ Default Template

 .. code-block:: console

-[IMAGE:img] <BOS> what does the image describe? <EOS> -> <BOS> [TEXT:cap] <EOS>
+[IMAGE:img] what does the image describe? -> [TEXT:cap]


 Usage
 ^^^^^^^^^^^^^^^^^^^^

 .. code-block::

->>> template = '[IMAGE:img] <BOS> what does the image describe? <EOS> -> <BOS> [TEXT:cap] <EOS>'
+>>> template = '[IMAGE:img] what does the image describe? -> [TEXT:cap]'
 >>> data = {'img': "https://www.2008php.com/2014_Website_appreciate/2015-06-22/20150622131649.jpg"}
 >>> output = model.inference(template, data=data)
 >>> print(output)
@@ -108,7 +108,7 @@ Default Template
 ^^^^^^^^^^^^^^^^
 .. code-block:: console

-[IMAGE:image] <BOS> [TEXT:question] <EOS> -> <BOS> [TEXT:answer,closed_set] <EOS>
+[IMAGE:image] [TEXT:question] -> [TEXT:answer,closed_set]


 .. _snlive:

docs/source/task/motion.rst

+62-3
@@ -14,12 +14,71 @@ We now describe how OFASys implements denoising diffusion probabilistic modeling

 Here we assume the dataset is a table, where each sample record contains two fields.
 Field "mocap" of modality "MOTION" is a BVH file containing motion capture data,
-while field "title" of modality "TEXT" is a text sentence describing the captured motion, e.g., "a person walks four steps backward".
-Similarly, we can replace "title" with other modalities such as "[AUDIO:...]", "[IMAGE:...]", "[VIDEO:...]" to implement various kinds of conditional synthesis tasks.
+while field "text" of modality "TEXT" is a text sentence describing the captured motion, e.g., "a person walks four steps backward".
+Similarly, we can replace "text" with other modalities such as "[AUDIO:...]", "[IMAGE:...]", "[VIDEO:...]" to implement various kinds of conditional synthesis tasks.
 And we can simply replace the text with an empty string to implement the task of unconditional motion synthesis, a.k.a. motion prediction.

 Default Template
 ^^^^^^^^^^^^^^^^
 .. code-block:: console

-[TEXT:title] -> [MOTION:mocap,preprocess=motion_6d,adaptor=motion_6d]
+motion capture: [TEXT:text] -> [MOTION:bvh_frames,preprocess=motion_6d,adaptor=motion_6d]
+
+Usage
+^^^^^^^^^^^^^^^^^^^^
+
+Prepare the model, instruction, and text prompts for text-to-motion generation.
+
+.. code:: python
+
+import torch
+from ofasys import OFASys
+# This checkpoint is for demonstration purposes only, and does not represent the final quality of any project or product.
+# The checkpoint is for research only, and commercial use is prohibited.
+model = OFASys.from_pretrained('http://ofasys.oss-cn-zhangjiakou.aliyuncs.com/model_hub/single_task_motion.pt')
+if torch.cuda.is_available():
+    model = model.cuda()
+instruction = 'motion capture: [TEXT:text] -> [MOTION:bvh_frames,preprocess=motion_6d,adaptor=motion_6d]'
+prompts = [
+    {'text': 'run then jump'},
+    {'text': 'run then jump like dancing'},
+]
+
+Example 1: Inference without classifier-free guidance. This usage is much simpler and more concise than the classifier-free guided approach described later (see Example 2). However, the generated results tend to correlate poorly with the text prompts. It is thus *NOT* recommended.
+
+.. code:: python
+
+output = model.inference(instruction, data=prompts)
+output[0].save_as_gif('run_then_jump__no_guide.gif')
+
+
+Example 2 (*recommended*): Inference with `classifier-free guidance <https://arxiv.org/abs/2207.12598>`_ enabled.
+It uses an experimental API for negative prompting and classifier-free guidance.
+Classifier-free guidance is implemented by providing the NULL condition (i.e., an empty text) as the negative prompt.
+
+.. code:: python
+
+guided_prompts = []
+for p in prompts:
+    guided_prompts.append(p)
+    guided_prompts.append({'text': ''})  # The negative prompt: an empty string for classifier-free guidance.
+# This API requires the positive and negative prompts to be in the same batch, so assert batch_size % 2 == 0.
+output = model.inference(instruction, data=guided_prompts, guidance_weight=3.0, batch_size=2)
+output = output[::2]
+output[0].save_as_gif('run_then_jump__guided.gif')
+output[1].save_as_gif('run_then_jump_like_dancing__guided.gif')
+output[0].save_as_bvh('run_then_jump__guided.bvh')  # Export the result in the BVH format for Blender.
+
+CASE
+^^^^^^^^^^^^^^^^^^
+The saved result "run_then_jump__guided.gif" should look like the one below:
+
+.. image:: https://ofasys.oss-cn-zhangjiakou.aliyuncs.com/examples/run_then_jump_guided.gif
+
+The saved result "run_then_jump_like_dancing__guided.gif" should look like the one below:
+
+.. image:: https://ofasys.oss-cn-zhangjiakou.aliyuncs.com/examples/run_then_jump_like_dancing_guided.gif
+
+The saved result in the `BVH <https://research.cs.wisc.edu/graphics/Courses/cs-838-1999/Jeff/BVH.html>`_ file format, "run_then_jump__guided.bvh", can be imported into 3D animation software such as `Blender <https://www.blender.org/>`_ for rendering:
+
+.. video:: http://ofasys.oss-cn-zhangjiakou.aliyuncs.com/examples/run_then_jump_guided.mp4
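For readers new to classifier-free guidance: the interleaved positive/empty prompts above let the sampler form both a conditional and an unconditional prediction at each denoising step and blend them. A conceptual sketch of that blend (not the OFASys internals; the exact convention behind `guidance_weight` may differ):

```python
def guided_prediction(pred_cond, pred_uncond, w=3.0):
    # w = 0 ignores the text prompt; larger w pushes samples harder
    # toward the prompt, typically at some cost in diversity.
    return pred_uncond + w * (pred_cond - pred_uncond)
```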

docs/source/task/structural.rst

+1-1
@@ -65,7 +65,7 @@ Usage

 .. code:: python

->>> instruction = '<BOS> structured knowledge: " [TEXT:database] " . how to describe the tripleset ? <EOS> -> <BOS> [TEXT:tgt] '
+>>> instruction = 'structured knowledge: " [TEXT:database] " . how to describe the tripleset ? -> [TEXT:tgt] '
 >>> data = {'database': database}
 >>> output = model.inference(instruction, data=data)
 >>> print(output.text)
