
Running a llama model with unsloth

by Kc93 2025. 2. 25.
import transformers
from transformers import AutoTokenizer

# Initialize the model
model_id = "unsloth/llama-3-8b-Instruct-bnb-4bit"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Create pipeline
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    tokenizer=tokenizer,
    device_map="auto"
)
# Load the test questions
english_1 = """
빈칸에 알맞는 것을 고르시오.
The founding population of our direct ancestors is not
thought to have been much larger than 2,000 individuals;
some think the group was as small as a few hundred. How,
then, did we go from such a fragile minority population to a
tide of humanity 7 billion strong and growing? There is only
one way, according to Richard Potts. You give up on
___________. You don't try to beat back the changes. You
begin not to care about consistency within a given habitat,
because such consistency isn't an option. You adapt to
variation itself. It was a brilliant strategy. Instead of
learning how to survive in just one or two ecological
environments, we took on the entire globe.

1번 "stability" 2번 "morality" 3번 "fairness" 4번 "reputation" 5번 "challenges"
"""

english_2 = """대화의 빈 칸에 알맞은 것을 고르세요.
A : Happy birthday!
B : Oh, ____________

1번 "sorry" 2번 "thank you" 3번 "June first"
 """

english_3 = """주어진 단어에 포함되는 단어를 모두 고르시오.
주어진 단어 : "Pet"
1번 "cat" 2번 "picture" 3번 "rabbit" 4번 "toothpaste"
"""

korean_1 = """동물 친구들이 곰에게 글을 읽을 때의 바른 자세에 대하여 말하고 있습니다. 토끼는 의자를 당겨서 앉아야 한다고 하였습니다. 기린은 허리를 곧게 펴야 한다고 하였습니다. 생쥐는 책과 눈의 거리를 알맞게 해야 한다고 하였습니다.

1. 토끼가 곰에게 해 준 말은 무엇입니까?
1번 "허리를 곧게 펴야 한다." 2번 "의자를 당겨서 앉아야 한다." 3번 "다리를 쭉 펴고 읽어야 한다." 4번 "책상 위에 엎드려 읽어야 한다." 5번 "책과 팔의 거리를 알맞게 해야 한다."
"""

korean_2 = """다음 중 자음자를 고르세요.
1번 "" 2번 "" 3번 "" 4번 "" 5번 ""
"""

korean_3 = """나무 나무 무슨 나무
가자 가자 감나무
배가 아파 배나무
바람 솔솔 _______

빈칸에 들어갈 말로 가장 알맞는 것은 무엇입니까?
1번 "소나무" 2번 "감나무" 3번 "배나무" 4번 "사과나무" 5번 "포도나무"
"""
Running the prompt
message = [
    {"role": "system", "content": "You are a helpful assistant chatbot."},
    {"role": "user", "content": english_1 + "Speak as a teacher speaks to a student. Please tell me the correct answer to the problem and provide an appropriate explanation. Please speak Korean."}
]
prompt = tokenizer.apply_chat_template(message, add_generation_prompt=True, tokenize=False)

import time

start = time.time()

# Generate text
sequences = pipeline(
    prompt,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    num_return_sequences=1,
    max_length=2000,
)
print(sequences[0]['generated_text'])

end = time.time()

print(f"{end - start:.5f} sec")
Fine Tuning 
from unsloth import FastLanguageModel, PatchFastRL
PatchFastRL("GRPO", FastLanguageModel)  # patches TRL for GRPO-style RL training; the SFT run below does not depend on it

from unsloth import is_bfloat16_supported
import torch
max_seq_length = 512 # Can increase for longer reasoning traces
lora_rank = 16 # Larger rank = smarter, but slower
model_id = "unsloth/llama-3-8b-Instruct-bnb-4bit"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id,
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.7, # Reduce if out of memory
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)
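To see how small the trainable footprint of the rank-16 adapters is compared with the frozen 8B base weights, PEFT's built-in helper can be called on the returned model (a quick optional check):

# Report the number of trainable LoRA parameters vs. the total (frozen) parameters
model.print_trainable_parameters()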
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("mlabonne/FineTome-100k", split = "train")


from unsloth.chat_templates import standardize_sharegpt
dataset = standardize_sharegpt(dataset)
dataset = dataset.map(formatting_prompts_func, batched = True,)
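To sanity-check the formatting, printing one mapped example shows the llama-3.1 chat template applied to a conversation (a quick optional check):

# Inspect a single formatted training example
print(dataset[5]["text"])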
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

# Sanity check: train_on_responses_only masks the user turns with -100, so decoding the labels
# (with -100 replaced by a space token) should show only the assistant responses.
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]])
import time

start = time.time()

trainer_stats = trainer.train()

end = time.time()

print(f"{end - start:.5f} sec")