huaxi-backend-master/test_token.py

# Call a local model with transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "/data1/lingyun/model/QwenQwen3-32B-AWQ"
# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
)
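# Note (an assumption from the -AWQ suffix in the path, not stated in this
# file): loading an AWQ-quantized checkpoint through transformers normally
# requires the autoawq package to be installed.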
# prepare the model input
prompt = "你好2024年有什么大事件发生"  # "Hello, what major events happened in 2024?"
messages = [
    {"role": "user", "content": prompt},
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True,  # switches between thinking and non-thinking modes (default: True)
)
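# For comparison (a hedged aside, not in the original script): rendering the
# same messages with enable_thinking=False makes Qwen3's chat template prefill
# an empty think block, so the model answers without a reasoning trace.
text_no_think = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
)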
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
# conduct text completion
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=32768,
)
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
# parse the thinking content out of the completion
try:
    # rindex-style search for token 151668 (</think>)
    index = len(output_ids) - output_ids[::-1].index(151668)
except ValueError:
    index = 0  # no </think> token found; treat everything as final content
thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
print("thinking content:", thinking_content)
print("content:", content)
# print("Triton 版本:", triton.__version__)
# if hasattr(tl, 'interleave'):
# print(" 当前 Triton 版本支持 tl.interleave")
# else:
# print("当前 Triton 版本不支持 tl.interleave")
# # print(dir(triton.language))
# print(f"Transformers: {tf_version}, PyTorch: {torch.__version__}")
# model_path = r"/data1/lingyun/model/QwenQwen3-32B-AWQ"
# def llm_answer(system_content, choose_model=model_path, user_content='', temperature=0.1, max_tokens=128, stream=False):
#     print(1111, system_content, "\n", 2222, user_content)
#     tokenizer = AutoTokenizer.from_pretrained(
#         choose_model,
#         trust_remote_code=True  # DeepSeek checkpoints need this flag
#     )
#     model = AutoModelForCausalLM.from_pretrained(
#         choose_model,
#         device_map="auto",      # pick GPU/CPU placement automatically
#         torch_dtype="auto",     # pick precision automatically
#         trust_remote_code=True  # allow the model's custom code to run
#     )
#     # key step to fix the pad_token problem
#     if tokenizer.pad_token is None:
#         tokenizer.pad_token = tokenizer.eos_token  # reuse eos_token as pad_token
#         model.config.pad_token_id = tokenizer.pad_token_id  # keep the model config in sync
#     messages = [
#         {"role": "system", "content": system_content},  # was a hardcoded string; use the parameter
#     ]
#     inputs_ids = tokenizer.apply_chat_template(
#         messages,
#         tokenize=False,
#         add_generation_prompt=True,
#         enable_thinking=True  # switches between thinking and non-thinking modes (default: True)
#     )
#     model_inputs = tokenizer([inputs_ids], return_tensors="pt").to(model.device)
#     # conduct text completion (fixed: unpack model_inputs, and honor the
#     # function's temperature/max_tokens parameters, which were ignored before)
#     generated_ids = model.generate(
#         **model_inputs,
#         max_new_tokens=max_tokens,
#         do_sample=True,
#         temperature=temperature,
#     )
#     output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
#     # parse the thinking content
#     try:
#         # rindex-style search for token 151668 (</think>)
#         index = len(output_ids) - output_ids[::-1].index(151668)
#     except ValueError:
#         index = 0
#     thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
#     content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
#     return thinking_content, content
# if __name__ == '__main__':
#     system_content = "你好2024年有什么大事件发生"  # "Hello, what major events happened in 2024?"
#     model_path = r"/data1/lingyun/model/QwenQwen3-32B-AWQ"
#     thinking_content, content = llm_answer(system_content, model_path)
#     print(thinking_content, "\n", "\n")
#     print(content)
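# Hedged sketch (not in the original file): the unused `stream` parameter above
# could be wired to transformers' TextStreamer, which prints tokens to stdout
# as they are generated:
#
# from transformers import TextStreamer
# streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
# model.generate(**model_inputs, max_new_tokens=512, streamer=streamer)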