# Calling a local model with transformers
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "/data1/lingyun/model/QwenQwen3-32B-AWQ"

# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)

# prepare the model input
prompt = "你好,2024年有什么大事件发生?"  # "Hi, what major events happened in 2024?"
messages = [
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True  # Switches between thinking and non-thinking modes. Default is True.
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# conduct text completion
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=32768
)
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

# parse the thinking content
try:
    # rindex finding 151668 (</think>)
    index = len(output_ids) - output_ids[::-1].index(151668)
except ValueError:
    index = 0

thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

print("thinking content:", thinking_content)
print("content:", content)

# Optional environment checks (kept disabled):
# import triton
# import triton.language as tl
# from transformers import __version__ as tf_version
# print("Triton version:", triton.__version__)
# if hasattr(tl, 'interleave'):
#     print("This Triton version supports tl.interleave")
# else:
#     print("This Triton version does not support tl.interleave")
# # print(dir(triton.language))
# print(f"Transformers: {tf_version}, PyTorch: {torch.__version__}")

# Alternative: the same flow wrapped in a helper function (kept disabled).
# model_path = r"/data1/lingyun/model/QwenQwen3-32B-AWQ"

# def llm_answer(system_content, choose_model=model_path, user_content='', temperature=0.1, max_tokens=128, stream=False):
#     tokenizer = AutoTokenizer.from_pretrained(
#         choose_model,
#         trust_remote_code=True  # required by some models (e.g. DeepSeek)
#     )
#     model = AutoModelForCausalLM.from_pretrained(
#         choose_model,
#         device_map="auto",       # automatically place layers on GPU/CPU
#         torch_dtype="auto",      # automatically pick the precision
#         trust_remote_code=True   # allow the model's custom code to run
#     )
#     # fix the missing pad_token (important)
#     if tokenizer.pad_token is None:
#         tokenizer.pad_token = tokenizer.eos_token           # reuse eos_token as pad_token
#         model.config.pad_token_id = tokenizer.pad_token_id  # keep the model config in sync
#     messages = [
#         {"role": "system", "content": system_content},
#         {"role": "user", "content": user_content},
#     ]
#     text = tokenizer.apply_chat_template(
#         messages,
#         tokenize=False,
#         add_generation_prompt=True,
#         enable_thinking=True  # Switches between thinking and non-thinking modes. Default is True.
#     )
#     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
#     # conduct text completion
#     generated_ids = model.generate(
#         **model_inputs,
#         max_new_tokens=32768
#     )
#     output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
#     # parse the thinking content
#     try:
#         # rindex finding 151668 (</think>)
#         index = len(output_ids) - output_ids[::-1].index(151668)
#     except ValueError:
#         index = 0
#     thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
#     content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
#     return thinking_content, content

# if __name__ == '__main__':
#     system_content = "你好,2024年有什么大事件发生?"
# model_path = r"/data1/lingyun/model/QwenQwen3-32B-AWQ" # thinking_content,content = llm_answer(system_content,model_path) # print(thinking_content,"\n","\n") # print(content)