-import torch
-from .ChatRWKV.src.model_run import RWKV_RNN
-from .ChatRWKV.src.utils import TOKENIZER
-import os
-import copy
-import types
-import gc
-import numpy as np
from pathlib import Path
from threading import Lock
+from copy import deepcopy
+import os
+import torch

-np.set_printoptions(precision=4, suppress=True, linewidth=200)
-args = types.SimpleNamespace()
-
-print('\n\nChatRWKV project: https://github.com/BlinkDL/ChatRWKV')
-
-########################################################################################################
+os.environ['RWKV_JIT_ON'] = '1'
+os.environ["RWKV_CUDA_ON"] = '0'

-args.RUN_DEVICE = "cuda"  # cuda // cpu
-# fp16 (good for GPU, does NOT support CPU) // fp32 (good for CPU) // bf16 (worse accuracy, supports CPU)
-args.FLOAT_MODE = "fp16"
+from .prompt import INIT_PROMPT, CHAT_FORMAT
+from .pipeline import PIPELINE, PIPELINE_ARGS
+from rwkv.model import RWKV  # pip install rwkv

-QA_PROMPT = False  # True: Q & A prompt // False: User & Bot prompt
-# 中文问答设置QA_PROMPT=True(只能问答,问答效果更好,但不能闲聊) 中文聊天设置QA_PROMPT=False(可以闲聊,但需要大模型才适合闲聊)

-# Download RWKV-4 models from https://huggingface.co/BlinkDL (don't use Instruct-test models unless you use their prompt templates)
+MODEL_DIR = Path('resource/chat/models')
+TOKEN_PATH = MODEL_DIR / '20B_tokenizer.json'
+STRATEGY = 'cuda fp16'

-MODELS = 'resource/chat/models'
-MODEL_FORMAT = '.pth'
-for f in os.listdir(MODELS):
-    if not f.endswith(MODEL_FORMAT):
+MODEL_EXT = '.pth'
+MODEL_PATH = None
+for f in MODEL_DIR.glob('*'):
+    if f.suffix != MODEL_EXT:
        continue
-    f = f[:-len(MODEL_FORMAT)]
-    args.MODEL_NAME = f'{MODELS}/{f}'
-    if 'ctx2048' in f:
-        args.ctx_len = 2048
-    else:
-        args.ctx_len = 1024
+    MODEL_PATH = f.with_suffix('')
    break

-if not args.MODEL_NAME:
-    print('!!!Chat model not found!!!')
-    raise Exception('Chat model not found')
-
-CHAT_LEN_SHORT = 40
-CHAT_LEN_LONG = 150
-FREE_GEN_LEN = 200
-
-GEN_TEMP = 1.0
-GEN_TOP_P = 0.85
-
-AVOID_REPEAT = ',。:?!'
-
-########################################################################################################
-
-os.environ["RWKV_RUN_DEVICE"] = args.RUN_DEVICE
-
-
-print(
-    f'\nLoading ChatRWKV - "Chinese" - {args.RUN_DEVICE} - {args.FLOAT_MODE} - QA_PROMPT {QA_PROMPT}')
-
-
-# please tune these (test True/False for all of them). can significantly improve speed.
-# torch._C._jit_set_profiling_executor(True)
-# torch._C._jit_set_profiling_mode(True)
-# torch._C._jit_override_can_fuse_on_cpu(True)
-# torch._C._jit_override_can_fuse_on_gpu(True)
-# torch._C._jit_set_texpr_fuser_enabled(False)
-# torch._C._jit_set_nvfuser_enabled(False)
-
-torch.backends.cudnn.benchmark = True
-torch.backends.cudnn.allow_tf32 = True
-torch.backends.cuda.matmul.allow_tf32 = True
-tokenizer = TOKENIZER(
-    str(Path(__file__).parent / "ChatRWKV" / "20B_tokenizer.json"))
-
-args.vocab_size = 50277
-args.head_qk = 0
-args.pre_ffn = 0
-args.grad_cp = 0
-args.my_pos_emb = 0
-MODEL_NAME = args.MODEL_NAME
-
-
-# Load Model
-
-print(f'Loading model - {MODEL_NAME}')
-model = RWKV_RNN(args)
-
-model_tokens = []
-model_state = None
-
-AVOID_REPEAT_TOKENS = []
-for i in AVOID_REPEAT:
-    dd = tokenizer.encode(i)
-    assert len(dd) == 1
-    AVOID_REPEAT_TOKENS += dd
-
-########################################################################################################
-
-user = "博士"
-bot = "牛牛"
-interface = ": "
-
-init_prompt = f'''
-{bot}真名叫帕拉斯,年轻女性,曾经生活在米诺斯,现在在罗德岛工作,{bot}的头上有一朵花、一对角,喜欢喝酒、蹦迪、逛庆典。
-{user}是罗德岛的领袖阿米娅的顾问,而罗德岛是一个医疗组织,他们的目标是拯救矿石病感染者
-以下是{bot}与{user}的对话:
-
-{user}{interface}你是谁?
-
-{bot}{interface}我是帕拉斯,也可以叫我牛牛
+print('Chat model:', MODEL_PATH)

-{user}{interface}我是谁?
-
-{bot}{interface}你是博士呀
-
-{user}{interface}你喜欢喝酒吗?
-
-{bot}{interface}喜欢,要不要来一杯?
-
-{user}{interface}你好笨
-
-{bot}{interface}这对角可能会不小心撞倒些家具,我会尽量小心。
-
-'''
-
-
-def run_rnn(tokens, newline_adj=0):
-    global model_tokens, model_state
-
-    tokens = [int(x) for x in tokens]
-    model_tokens += tokens
-    out, model_state = model.forward(tokens, model_state)
+if not MODEL_PATH:
+    print(f'!!!!!!Chat model not found, please put it in {MODEL_DIR}!!!!!!')
+    print(f'!!!!!!Chat 模型不存在,请放到 {MODEL_DIR} 文件夹下!!!!!!')
+    raise Exception('Chat model not found')

-    # print(f'### model ###\n{tokens}\n[{tokenizer.decode(model_tokens)}]')
+if not TOKEN_PATH.exists():
+    print(f'AI Chat updated, please put token file to {TOKEN_PATH}, download: https://github.com/BlinkDL/ChatRWKV/blob/main/20B_tokenizer.json')
+    print(f'牛牛的 AI Chat 版本更新了,把 token 文件放到 {TOKEN_PATH} 里再启动, 下载地址:https://github.com/BlinkDL/ChatRWKV/blob/main/20B_tokenizer.json')
+    raise Exception('Chat token not found')

-    out[0] = -999999999  # disable <|endoftext|>
-    out[187] += newline_adj  # adjust \n probability
-    # if newline_adj > 0:
-    #     out[15] += newline_adj / 2  # '.'
-    if model_tokens[-1] in AVOID_REPEAT_TOKENS:
-        out[model_tokens[-1]] = -999999999
-    return out
+torch.cuda.empty_cache()
+model = RWKV(model=str(MODEL_PATH), strategy=STRATEGY)
+pipeline = PIPELINE(model, str(TOKEN_PATH))
+args = PIPELINE_ARGS(temperature=1.0, top_p=0.7,
+                     alpha_frequency=0.25,
+                     alpha_presence=0.25,
+                     token_ban=[0],  # ban the generation of some tokens
+                     token_stop=[187])  # stop generation whenever you see any token here


+CHAT_INIT = "CHAT_INIT"
all_state = {}
-INIT_SESSION = 'chat_init'
-
-
-def save_all_stat(session: str, last_out):
-    all_state[session] = {}
-    all_state[session]['out'] = last_out
-    all_state[session]['rnn'] = copy.deepcopy(model_state)
-    all_state[session]['token'] = copy.deepcopy(model_tokens)
-
+all_state[CHAT_INIT] = deepcopy(pipeline.generate(
+    INIT_PROMPT, token_count=200, args=args)[1])

-def load_all_stat(session: str):
-    global model_tokens, model_state
+chat_locker = Lock()

-    if session not in all_state:
-        out = load_all_stat(INIT_SESSION)
-        save_all_stat(session, out)

-    model_state = copy.deepcopy(all_state[session]['rnn'])
-    model_tokens = copy.deepcopy(all_state[session]['token'])
-    return all_state[session]['out']
+def chat(session: str, text: str, token_count: int = 50) -> str:
+    with chat_locker:
+        state = all_state[session] if session in all_state else deepcopy(
+            all_state[CHAT_INIT])
+        ctx = CHAT_FORMAT.format(text)
+        out, state = pipeline.generate(
+            ctx, token_count=token_count, args=args, state=state)
+        all_state[session] = deepcopy(state)
+        return out


-def del_all_stat(session: str):
+def del_session(session: str):
    if session in all_state:
        del all_state[session]

-########################################################################################################
-
-
-# Run inference
-print(f'\nRun prompt...')
-
-out = run_rnn(tokenizer.encode(init_prompt))
-save_all_stat(INIT_SESSION, out)
-gc.collect()
-torch.cuda.empty_cache()
-
-print(f'### prompt ###\n[{tokenizer.decode(model_tokens)}]\n')
-
-chat_locker = Lock()
-
-
-def answer(session: str, text: str):
-    with chat_locker:
-        global model_tokens, model_state
-
-        out = load_all_stat(session)
-        new = f"{user}{interface}{text}\n\n{bot}{interface}"
-        out = run_rnn(tokenizer.encode(new), newline_adj=-999999999)
-        save_all_stat(session, out)
-
-        ans = ''
-        begin = len(model_tokens)
-        out_last = begin
-        for i in range(999):
-            if i <= 0:
-                newline_adj = -999999999
-            elif i <= CHAT_LEN_SHORT:
-                newline_adj = (i - CHAT_LEN_SHORT) / 10
-            elif i <= CHAT_LEN_LONG:
-                newline_adj = 0
-            else:
-                newline_adj = (i - CHAT_LEN_LONG) * \
-                    0.25  # MUST END THE GENERATION
-            token = tokenizer.sample_logits(
-                out,
-                model_tokens,
-                args.ctx_len,
-                temperature=GEN_TEMP,
-                top_p=GEN_TOP_P,
-            )
-            out = run_rnn([token], newline_adj=newline_adj)
-            xxx = tokenizer.decode(model_tokens[out_last:])
-            if '\ufffd' not in xxx:  # avoid utf-8 display issues
-                ans += xxx
-                # print(xxx, end='', flush=True)
-                out_last = begin + i + 1
-
-            send_msg = tokenizer.decode(model_tokens[begin:])
-            if '\n\n' in send_msg:
-                send_msg = send_msg.strip()
-                break
-
-            # send_msg = tokenizer.decode(model_tokens[begin:]).strip()
-            # if send_msg.endswith(f'{user}{interface}'):  # warning: needs to fix state too !!!
-            #     send_msg = send_msg[:-len(f'{user}{interface}')].strip()
-            #     break
-            # if send_msg.endswith(f'{bot}{interface}'):
-            #     send_msg = send_msg[:-len(f'{bot}{interface}')].strip()
-            #     break
-
-        # print(f'{model_tokens}')
-        # print(f'[{tokenizer.decode(model_tokens)}]')
-
-        save_all_stat(session, out)
-        return ans.strip()
-

if __name__ == "__main__":
    while True:
-        session = 1
+        session = "main"
        text = input('text:')
-        answer(session, text)
+        result = chat(session, text)
+        print(result)
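
Note: INIT_PROMPT and CHAT_FORMAT come from the new .prompt module, which is not part of this diff. Judging from the init_prompt that the diff removes, they presumably look roughly like the sketch below; the exact names and wording in .prompt are assumptions.

# Hypothetical sketch of .prompt -- reconstructed from the removed init_prompt, not taken from this diff.
user = "博士"
bot = "牛牛"
interface = ": "

INIT_PROMPT = f'''
{bot}真名叫帕拉斯,年轻女性,曾经生活在米诺斯,现在在罗德岛工作,{bot}的头上有一朵花、一对角,喜欢喝酒、蹦迪、逛庆典。
{user}是罗德岛的领袖阿米娅的顾问,而罗德岛是一个医疗组织,他们的目标是拯救矿石病感染者
以下是{bot}与{user}的对话:

{user}{interface}你是谁?

{bot}{interface}我是帕拉斯,也可以叫我牛牛
'''

# chat() calls CHAT_FORMAT.format(text), so the template needs a single {} placeholder
# for the user's message and leaves the bot's turn open for pipeline.generate() to continue.
CHAT_FORMAT = f'{user}{interface}{{}}\n\n{bot}{interface}'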
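For reference, a minimal usage sketch of the new API, assuming this module has been imported elsewhere in the bot; the session key below is only illustrative.

# Each chat or group can use its own session key, so conversations keep separate RWKV state.
reply = chat(session='group:12345', text='你是谁?', token_count=50)
print(reply)

reply = chat(session='group:12345', text='我是谁?')  # same session continues the same context
print(reply)

del_session('group:12345')  # drop the cached state to reset this conversation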