Commit f3c664b

Merge pull request #25 from s-JoL/dev
v2 release
2 parents 92af968 + c890bce commit f3c664b

49 files changed: +1185 -1439 lines

README.md

+107-72
Large diffs are not rendered by default.

README_en.md

+189-118
Large diffs are not rendered by default.

assets/chinese.JPG

-961 KB
Binary file not shown.

assets/code.JPG

-203 KB
Binary file not shown.

assets/instruct_loss.png

38.8 KB

assets/paper.JPG

-289 KB
Binary file not shown.

assets/pretrain_loss.png

-10.8 KB

chat_server.py

+22-13
@@ -2,28 +2,29 @@
 Author: LiangSong([email protected])
 Date: 2023-04-06 22:30:10
 LastEditors: LiangSong([email protected])
-LastEditTime: 2023-04-07 23:03:31
+LastEditTime: 2023-04-27 20:34:58
 FilePath: /Open-Llama/chat_server.py
 Description: 
 
 Copyright (c) 2023 by LiangSong([email protected]), All Rights Reserved.
 """
 import torch
 import gradio as gr
-import sentencepiece as spm
-from dataset.tokenizer import Tokenizer
-from transformers import LlamaForCausalLM, LlamaConfig
+from transformers import OpenLlamaForCausalLM, OpenLlamaConfig, LlamaTokenizer
 
 
-sp_model = spm.SentencePieceProcessor(
-    model_file="configs/10w_vocab_wudao5_pile10.model"
+tokenizer = LlamaTokenizer(
+    "configs/10w_vocab_wudao5_pile10.model",
+    pad_token="<pad>",
+    add_bos_token=False,
+    add_eos_token=True,
 )
-tokenizer = Tokenizer(sp_model)
-raw_model = LlamaForCausalLM(
-    LlamaConfig(
+
+raw_model = OpenLlamaForCausalLM(
+    OpenLlamaConfig(
         vocab_size=tokenizer.vocab_size,
         initializer_range=0.01,
-        pad_token_id=tokenizer.pad_id,
+        pad_token_id=tokenizer.pad_token_id,
         rms_norm_eps=1e-5,
         hidden_dropout_prob=0.1,
         attention_dropout_prob=0.1,
@@ -80,20 +81,28 @@ def bot(history):
         if completion is None:
             inputs = "user:{}\nsystem:".format(prompt)
             inputs = tokenizer(
-                inputs, return_tensors=True, add_special_tokens=False
+                inputs,
+                return_tensors="pt",
+                add_special_tokens=False,
+                return_attention_mask=False,
             )
             context.append(inputs["input_ids"])
         else:
             inputs = "user:{}\nsystem:{}".format(prompt, completion)
-            inputs = tokenizer(inputs, return_tensors=True, add_special_tokens=True)
+            inputs = tokenizer(
+                inputs,
+                return_tensors="pt",
+                add_special_tokens=True,
+                return_attention_mask=False,
+            )
             context.append(inputs["input_ids"])
     context = torch.cat(context, dim=-1)
     context = context[:, -1024:]
     inputs_len = context.shape[1]
     context = context.cuda()
     pred = model.generate(input_ids=context, max_new_tokens=512, do_sample=True)
     pred = pred[:, inputs_len:]
-    pred = tokenizer.decode(pred.cpu())[0]
+    pred = tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)
     print(pred)
     bot_message = parse_codeblock(pred)
     history[-1][1] = bot_message
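
The change above replaces the in-repo SentencePiece wrapper (dataset.tokenizer.Tokenizer) with transformers' LlamaTokenizer, and moves the model classes from LlamaForCausalLM/LlamaConfig to OpenLlamaForCausalLM/OpenLlamaConfig. Below is a minimal sketch of the new tokenize/decode path; the prompt string is illustrative and this is not the repo's exact serving code, only the API shape the diff assumes:

# Sketch only: assumes a transformers release that ships LlamaTokenizer and
# that the sentencepiece model file from configs/ is present locally.
from transformers import LlamaTokenizer

tokenizer = LlamaTokenizer(
    "configs/10w_vocab_wudao5_pile10.model",
    pad_token="<pad>",
    add_bos_token=False,
    add_eos_token=True,
)

# return_tensors="pt" yields torch tensors; return_attention_mask=False keeps
# only input_ids, which is all chat_server.py appends to its context.
inputs = tokenizer(
    "user:hello\nsystem:",
    return_tensors="pt",
    add_special_tokens=False,
    return_attention_mask=False,
)
input_ids = inputs["input_ids"]  # shape (1, seq_len)

# decode() takes a 1-D id sequence, hence pred.cpu()[0] in the diff;
# skip_special_tokens=True strips <pad>/</s> from the output text.
text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
print(text)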

configs/4w_cn_vocab_wudao15.model

845 KB
Binary file not shown.

configs/6w_vocab_wudao5_pile10.model

-1.23 MB
Binary file not shown.

configs/default_config.yaml

+1-13
@@ -1,30 +1,18 @@
 compute_environment: LOCAL_MACHINE
 deepspeed_config:
   deepspeed_multinode_launcher: standard
-  gradient_accumulation_steps: 12
   gradient_clipping: 1.0
   offload_optimizer_device: none
   offload_param_device: none
   zero3_init_flag: false
   zero_stage: 1
 distributed_type: DEEPSPEED
-downcast_bf16: 'no'
-dynamo_backend: 'no'
-# dynamo_config:
-#   dynamo_backend: INDUCTOR
-#   dynamo_mode: default
-#   dynamo_use_dynamic: true
-#   dynamo_use_fullgraph: false
 fsdp_config: {}
 machine_rank: 0
 main_training_function: main
-megatron_lm_config: {}
 mixed_precision: bf16
 num_machines: 1
 num_processes: 8
 rdzv_backend: static
 same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
-use_cpu: false
+use_cpu: false
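
With this change, gradient_accumulation_steps no longer lives in the Accelerate/DeepSpeed launch config; each training YAML in this commit carries its own value (12 for pretraining, 1 for instruction tuning). A hedged sketch of how that per-run value would typically be handed to Accelerate instead; this is illustrative, not necessarily the repo's exact training code:

# Sketch only: pass the accumulation value from the training YAML to the
# Accelerator rather than hard-coding it in default_config.yaml.
from accelerate import Accelerator

gradient_accumulation_steps = 12  # e.g. train.gradient_accumulation_steps from pretrain_config.yaml
accelerator = Accelerator(gradient_accumulation_steps=gradient_accumulation_steps)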

configs/instruct_config.yaml

+33
@@ -0,0 +1,33 @@
+data:
+  mode: "instruct"
+  data:
+    mixed: "data/instruction_data/part-*.jsonl.zst"
+  pad_to_max: False
+  sequence_sample_mode: "none"
+  concat_multiple_sequence: True
+  num_sequences: 50
+  seq_length: 2048
+  tokenizer_model_path: "configs/llama_tokenizer_extended.model"
+model:
+  initializer_range: 1.0e-2
+  hidden_dropout_prob: 0.1
+  attention_dropout_prob: 0.1
+  use_stable_embedding: False
+  shared_input_output_embedding: False
+train:
+  train_batch_size: 2
+  num_training_steps: 1000000
+  num_warmup_steps: 2000
+  initializer_range: 1.0e-2
+  lr: 2.0e-4
+  weight_decay: 1.0e-1
+  ckpt: "data/llama_raw_ckpt/7B/extended.pth"
+  train_num_workers: 16
+  gradient_accumulation_steps: 1
+  prefetch_factor: 100
+  # global step
+  log_interval: 50
+  eval_interval: 500
+  save_interval: 1000
+  work_dir: "data/saved_ckpt/7B"
+  project_name: "Llama Instruction"
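
The new YAML configs are plain nested mappings (data/model/train sections). A minimal sketch of reading one, assuming PyYAML; the repo's actual config loader is not shown in this diff and may differ:

# Sketch only: load the YAML into nested dicts and read a few keys.
import yaml

with open("configs/instruct_config.yaml") as fp:
    config = yaml.safe_load(fp)

print(config["data"]["mode"])               # "instruct"
print(config["data"]["seq_length"])         # 2048
print(config["train"]["train_batch_size"])  # 2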

configs/instruction_tuning_config.py

-25
This file was deleted.

1.04 MB
Binary file not shown.

configs/pretrain_config.py

-14
This file was deleted.

configs/pretrain_config.yaml

+36
@@ -0,0 +1,36 @@
+data:
+  mode: "pretrain"
+  data:
+    wudao: "data/pretrain_data/part-wudao*.jsonl.zst"
+    # Since the Llama checkpoint is loaded, only a small amount of English data is used
+    the_pile: "data/pretrain_data/part-pile-1*.jsonl.zst"
+  pad_to_max: False
+  sequence_sample_mode: "none"
+  concat_multiple_sequence: True
+  num_sequences: 10
+  seq_length: 2048
+  tokenizer_model_path: "configs/llama_tokenizer_extended.model"
+model:
+  initializer_range: 1.0e-2
+  hidden_dropout_prob: 0.1
+  attention_dropout_prob: 0.1
+  use_stable_embedding: False
+  shared_input_output_embedding: False
+train:
+  train_batch_size: 2
+  num_training_steps: 500000
+  num_warmup_steps: 2000
+  initializer_range: 1.0e-2
+  lr: 2.0e-4
+  weight_decay: 1.0e-1
+  # Load pretrained weights; set to null to train from scratch
+  ckpt: "data/llama_raw_ckpt/7B/extended.pth"
+  train_num_workers: 16
+  gradient_accumulation_steps: 12
+  prefetch_factor: 100
+  # global step
+  log_interval: 5
+  eval_interval: 500
+  save_interval: 1000
+  work_dir: "data/saved_ckpt/7B"
+  project_name: "Llama Pretrain"
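
For a sense of scale, the effective global batch can be worked out from the values above together with num_processes: 8 in configs/default_config.yaml, assuming each training sample is seq_length tokens; a back-of-the-envelope sketch:

# Purely illustrative arithmetic from values in this commit.
train_batch_size = 2             # sequences per device per micro-batch (pretrain_config.yaml)
gradient_accumulation_steps = 12 # pretrain_config.yaml
num_processes = 8                # default_config.yaml
seq_length = 2048                # pretrain_config.yaml

sequences_per_step = train_batch_size * gradient_accumulation_steps * num_processes  # 192
tokens_per_step = sequences_per_step * seq_length                                    # 393216
print(sequences_per_step, tokens_per_step)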

data/preprocess_instruction.py

+7-7
@@ -18,7 +18,7 @@
 dataset = load_dataset("yizhongw/self_instruct")
 write_path = root_dir + "/instruction_data/part-self_instruct-{}.jsonl.zst"
 total_num = 0
-file_num = 0
+file_num = 1
 wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
 for line in dataset["train"]:
     line = json.dumps(line)
@@ -39,7 +39,7 @@
 dataset = load_dataset("BelleGroup/train_0.5M_CN")
 write_path = root_dir + "/instruction_data/part-belle_0.5M-{}.jsonl.zst"
 total_num = 0
-file_num = 0
+file_num = 1
 wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
 for line in dataset["train"]:
     line = json.dumps(line)
@@ -60,7 +60,7 @@
 dataset = load_dataset("BelleGroup/train_1M_CN")
 write_path = root_dir + "/instruction_data/part-belle_1M-{}.jsonl.zst"
 total_num = 0
-file_num = 0
+file_num = 1
 wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
 for line in dataset["train"]:
     line = json.dumps(line)
@@ -81,7 +81,7 @@
 dataset = load_dataset("BelleGroup/school_math_0.25M")
 write_path = root_dir + "/instruction_data/part-belle_school_math_0.25M-{}.jsonl.zst"
 total_num = 0
-file_num = 0
+file_num = 1
 wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
 for line in dataset["train"]:
     line = json.dumps(line)
@@ -102,7 +102,7 @@
 dataset = load_dataset("BelleGroup/multiturn_chat_0.8M")
 write_path = root_dir + "/instruction_data/part-belle_multiturn_chat_0.8M-{}.jsonl.zst"
 total_num = 0
-file_num = 0
+file_num = 1
 wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
 for line in dataset["train"]:
     line = json.dumps(line)
@@ -123,7 +123,7 @@
 dataset = load_dataset("Graverman/Instruct-to-Code")
 write_path = root_dir + "/instruction_data/part-instruct_to_code-{}.jsonl.zst"
 total_num = 0
-file_num = 0
+file_num = 1
 wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
 for line in dataset["train"]:
     line = json.dumps(line)
@@ -143,7 +143,7 @@
 
 write_path = root_dir + "/instruction_data/part-sharegpt_90K-{}.jsonl.zst"
 total_num = 0
-file_num = 0
+file_num = 1
 wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
 with open("data/sg_90k_part1.json", "r") as fp:
     data1 = json.load(fp)
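
Every preprocessing loop now starts shard numbering at 1 instead of 0, so the first file each dataset emits is part-<name>-1.jsonl.zst. The scripts all follow the same write pattern: compressed JSON-lines shards opened through zstandard, rotating to a new file as lines accumulate. A hedged sketch of that pattern; the dataset name and the per-shard line count below are illustrative, the real threshold is defined in the scripts themselves:

# Sketch only: shard-writing pattern with numbering starting at 1.
import json
import zstandard as zstd

write_path = "data/instruction_data/part-example-{}.jsonl.zst"  # hypothetical dataset name
LINES_PER_SHARD = 16384  # illustrative rotation threshold

total_num = 0
file_num = 1
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for record in [{"prompt": "hi", "completion": "hello"}]:  # stand-in for a real dataset
    wfp.write(json.dumps(record) + "\n")
    total_num += 1
    if total_num % LINES_PER_SHARD == 0:  # roll over to the next shard
        file_num += 1
        wfp.close()
        wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
wfp.close()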

data/preprocess_the_pile.py

+1-1
@@ -17,7 +17,7 @@
 paths = glob("data/the_pile/*.jsonl.zst")
 write_path = "data/pretrain_data/part-pile-{}.jsonl.zst"
 total_num = 0
-file_num = 0
+file_num = 1
 wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
 for path in tqdm(paths, total=len(paths)):
     with zstd.open(path, "r", encoding="utf-8") as fp:

data/preprocess_wudao.py

+1-1
@@ -17,7 +17,7 @@
 paths = glob("data/WuDaoCorpus2.0_base_200G/part*")
 write_path = "data/pretrain_data/part-wudao-{}.jsonl.zst"
 total_num = 0
-file_num = 0
+file_num = 1
 wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
 for path in tqdm(paths, total=len(paths)):
     with open(path, "r") as fp:
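
The same file_num change applies to the pretraining shards, so the first files written are part-wudao-1.jsonl.zst and part-pile-1.jsonl.zst, which the globs in configs/pretrain_config.yaml pick up. A short sketch of reading such shards back, following the same zstandard usage as the scripts above:

# Sketch only: iterate the compressed JSON-lines shards matched by the
# part-wudao*.jsonl.zst glob from configs/pretrain_config.yaml.
import json
from glob import glob

import zstandard as zstd

total = 0
for path in sorted(glob("data/pretrain_data/part-wudao*.jsonl.zst")):
    with zstd.open(path, "r", encoding="utf-8") as fp:
        for line in fp:
            doc = json.loads(line)  # one JSON document per line
            total += 1
print(total)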

dataset/collate_fn.py

-69
This file was deleted.

0 commit comments
