# Calling a local model with transformers
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "/data1/lingyun/model/QwenQwen3-32B-AWQ"

# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)

# prepare the model input
prompt = "你好,2024年有什么大事件发生?"  # "Hi, what major events happened in 2024?"
messages = [
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True  # Switches between thinking and non-thinking modes. Default is True.
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# conduct text completion
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=32768
)
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

# parse the thinking content
try:
    # rindex finding 151668 (</think>)
    index = len(output_ids) - output_ids[::-1].index(151668)
except ValueError:
    index = 0

thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

print("thinking content:", thinking_content)
print("content:", content)

# Optional environment checks (kept disabled):
# import triton
# import triton.language as tl
# from transformers import __version__ as tf_version
# print("Triton version:", triton.__version__)
# if hasattr(tl, 'interleave'):
#     print("This Triton version supports tl.interleave")
# else:
#     print("This Triton version does not support tl.interleave")
# # print(dir(triton.language))
# print(f"Transformers: {tf_version}, PyTorch: {torch.__version__}")

# Alternative: the same flow wrapped in a helper function (kept disabled).
# model_path = r"/data1/lingyun/model/QwenQwen3-32B-AWQ"

# def llm_answer(system_content, choose_model=model_path, user_content='', temperature=0.1, max_tokens=128, stream=False):
#     tokenizer = AutoTokenizer.from_pretrained(
#         choose_model,
#         trust_remote_code=True  # required by some models (e.g. DeepSeek)
#     )
#     model = AutoModelForCausalLM.from_pretrained(
#         choose_model,
#         device_map="auto",       # automatically place layers on GPU/CPU
#         torch_dtype="auto",      # automatically pick the precision
#         trust_remote_code=True   # allow the model's custom code to run
#     )
#     # fix the missing pad_token (important)
#     if tokenizer.pad_token is None:
#         tokenizer.pad_token = tokenizer.eos_token           # reuse eos_token as pad_token
#         model.config.pad_token_id = tokenizer.pad_token_id  # keep the model config in sync
#     messages = [
#         {"role": "system", "content": system_content},
#         {"role": "user", "content": user_content},
#     ]
#     text = tokenizer.apply_chat_template(
#         messages,
#         tokenize=False,
#         add_generation_prompt=True,
#         enable_thinking=True  # Switches between thinking and non-thinking modes. Default is True.
#     )
#     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
#     # conduct text completion
#     generated_ids = model.generate(
#         **model_inputs,
#         max_new_tokens=32768
#     )
#     output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
#     # parse the thinking content
#     try:
#         # rindex finding 151668 (</think>)
#         index = len(output_ids) - output_ids[::-1].index(151668)
#     except ValueError:
#         index = 0
#     thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
#     content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
#     return thinking_content, content

# if __name__ == '__main__':
#     system_content = "你好,2024年有什么大事件发生?"
# model_path = r"/data1/lingyun/model/QwenQwen3-32B-AWQ" # thinking_content,content = llm_answer(system_content,model_path) # print(thinking_content,"\n","\n") # print(content)