# Call a local model with transformers
import torch
import triton
import triton.language as tl
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from transformers import __version__ as tf_version
# (some of these imports are only used by the commented-out code at the bottom of the file)

model_name = "/data1/lingyun/model/QwenQwen3-32B-AWQ"

# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)

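# Added note on the loading flags (descriptive, not from the original script):
# torch_dtype="auto" keeps the dtype recorded in the checkpoint config, and
# device_map="auto" lets accelerate place the AWQ-quantized weights across the
# available GPUs/CPU automatically.
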
# prepare the model input
prompt = "你好,2024年有什么大事件发生?"  # "Hello, what major events happened in 2024?"
messages = [
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True  # Switches between thinking and non-thinking modes. Default is True.
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

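# Added note (hedged): with tokenize=False, apply_chat_template returns the
# fully formatted prompt string (for Qwen3 this uses the ChatML-style
# <|im_start|>/<|im_end|> markers), and add_generation_prompt=True appends the
# opening of the assistant turn so generation starts from the model's reply.
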
# conduct text completion
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=32768
)
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

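# Added note: **model_inputs unpacks input_ids and attention_mask into
# generate(), and the slice above drops the prompt tokens so output_ids keeps
# only the newly generated part. max_new_tokens=32768 is an upper bound;
# generation normally stops earlier at the model's end-of-turn token.
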
# parsing thinking content
try:
    # rindex finding 151668 (</think>)
    index = len(output_ids) - output_ids[::-1].index(151668)
except ValueError:
    index = 0

thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

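# Added note: 151668 is assumed here to be the id of the "</think>" token in
# the Qwen3 tokenizer. A less brittle sketch (commented out, not part of the
# original script) would look the id up at runtime instead of hard-coding it:
# think_end_id = tokenizer.convert_tokens_to_ids("</think>")
# index = len(output_ids) - output_ids[::-1].index(think_end_id)
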
print("thinking content:", thinking_content)
print("content:", content)


# --- Commented-out experiments and an alternative llm_answer() wrapper ---

# print("Triton version:", triton.__version__)
# if hasattr(tl, 'interleave'):
#     print("The current Triton version supports tl.interleave")
# else:
#     print("The current Triton version does not support tl.interleave")
# # print(dir(triton.language))
# print(f"Transformers: {tf_version}, PyTorch: {torch.__version__}")

# model_path = r"/data1/lingyun/model/QwenQwen3-32B-AWQ"
# def llm_answer(system_content, choose_model=model_path, user_content='', temperature=0.1, max_tokens=128, stream=False):
#     print(1111, system_content, "\n", 2222, user_content)
#     tokenizer = AutoTokenizer.from_pretrained(
#         choose_model,
#         trust_remote_code=True  # required by DeepSeek models
#     )
#     model = AutoModelForCausalLM.from_pretrained(
#         choose_model,
#         device_map="auto",      # automatically choose GPU/CPU
#         torch_dtype="auto",     # automatically choose precision
#         trust_remote_code=True  # allow the model's custom code to run
#     )
#
#     # 3. Fix the pad_token issue (key step)
#     if tokenizer.pad_token is None:
#         tokenizer.pad_token = tokenizer.eos_token            # use eos_token as pad_token
#         model.config.pad_token_id = tokenizer.pad_token_id   # keep the model config in sync
#     messages = [
#         {"role": "system", "content": "你好,2024年有什么大事件发生?"},
#     ]
#     inputs_ids = tokenizer.apply_chat_template(
#         messages,
#         tokenize=False,
#         add_generation_prompt=True,
#         enable_thinking=True  # Switches between thinking and non-thinking modes. Default is True.
#     )
#     model_inputs = tokenizer([inputs_ids], return_tensors="pt").to(model.device)
#     # conduct text completion
#     generated_ids = model.generate(
#         **model_inputs,
#         max_new_tokens=32768
#     )
#     output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
#     # parsing thinking content
#     try:
#         # rindex finding 151668 (</think>)
#         index = len(output_ids) - output_ids[::-1].index(151668)
#     except ValueError:
#         index = 0
#
#     thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
#     content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
#
#     return thinking_content, content


# if __name__ == '__main__':
#     system_content = "你好,2024年有什么大事件发生?"
#     model_path = r"/data1/lingyun/model/QwenQwen3-32B-AWQ"
#     thinking_content, content = llm_answer(system_content, model_path)
#     print(thinking_content, "\n", "\n")
#     print(content)