<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
	<id>https://wiki.gacrc.uga.edu/api.php?action=feedcontributions&amp;feedformat=atom&amp;user=Kstanier</id>
	<title>Research Computing Center Wiki - User contributions [en]</title>
	<link rel="self" type="application/atom+xml" href="https://wiki.gacrc.uga.edu/api.php?action=feedcontributions&amp;feedformat=atom&amp;user=Kstanier"/>
	<link rel="alternate" type="text/html" href="https://wiki.gacrc.uga.edu/wiki/Special:Contributions/Kstanier"/>
	<updated>2026-05-17T08:24:01Z</updated>
	<subtitle>User contributions</subtitle>
	<generator>MediaWiki 1.39.7</generator>
	<entry>
		<id>https://wiki.gacrc.uga.edu/index.php?title=LLM_Training&amp;diff=21962</id>
		<title>LLM Training</title>
		<link rel="alternate" type="text/html" href="https://wiki.gacrc.uga.edu/index.php?title=LLM_Training&amp;diff=21962"/>
		<updated>2024-07-04T20:14:30Z</updated>

		<summary type="html">&lt;p&gt;Kstanier: /* Compute Resources */&lt;/p&gt;
&lt;hr /&gt;
&lt;div&gt;===Introduction===&lt;br /&gt;
&lt;br /&gt;
===HuggingFace===&lt;br /&gt;
&lt;br /&gt;
==== Hub ====&lt;br /&gt;
&lt;br /&gt;
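The training and submission scripts below assume the base model has already been downloaded from the HuggingFace Hub to GACRC storage (e.g. /scratch/$$YOUR_MYID/llm/models/hf/Meta-Llama-3-8B). As a minimal sketch, assuming the huggingface_hub CLI is installed and your account has accepted the gated model&#039;s license, the weights can be fetched roughly like this:&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
# Sketch only: repo ID and target path are illustrative; adjust to your project layout&lt;br /&gt;
huggingface-cli login                       # paste a token with access to the gated repo&lt;br /&gt;
huggingface-cli download meta-llama/Meta-Llama-3-8B \&lt;br /&gt;
    --local-dir /scratch/$$YOUR_MYID/llm/models/hf/Meta-Llama-3-8B&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
&lt;br /&gt;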
==== Libraries ====&lt;br /&gt;
&lt;br /&gt;
===== Transformers =====&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
module load Transformers&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
&lt;br /&gt;
===== Datasets =====&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
module load datasets&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
&lt;br /&gt;
===== TRL =====&lt;br /&gt;
(a software module is coming soon; in the meantime, TRL can be installed in a virtual environment)&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
python3 -m venv ~/trl_venv&lt;br /&gt;
source ~/trl_venv/bin/activate&lt;br /&gt;
pip install --require-virtualenv trl&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
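&lt;br /&gt;
A quick check that the install worked (run inside the still-activated venv):&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
python3 -c &amp;quot;import trl; print(trl.__version__)&amp;quot;&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;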
&lt;br /&gt;
=== Compute Resources ===&lt;br /&gt;
The benchmarks below fine-tune Meta-Llama-3-8B, loaded in 4-bit with the PEFT library, on 52,002 instruction articles for 3 epochs.&lt;br /&gt;
{| class=&amp;quot;wikitable&amp;quot;&lt;br /&gt;
|+Tested Accelerators&lt;br /&gt;
!Vendor&lt;br /&gt;
!Product&lt;br /&gt;
!Backend&lt;br /&gt;
!VRAM (GB)&lt;br /&gt;
|-&lt;br /&gt;
| rowspan=&amp;quot;3&amp;quot; |Nvidia&lt;br /&gt;
|L4&lt;br /&gt;
| rowspan=&amp;quot;3&amp;quot; |CUDA&lt;br /&gt;
|24&lt;br /&gt;
|-&lt;br /&gt;
|A100&lt;br /&gt;
| rowspan=&amp;quot;2&amp;quot; |80&lt;br /&gt;
|-&lt;br /&gt;
|H100&lt;br /&gt;
|-&lt;br /&gt;
|AMD&lt;br /&gt;
|MI210&lt;br /&gt;
|ROCm&lt;br /&gt;
|64&lt;br /&gt;
|}&lt;br /&gt;
{| class=&amp;quot;wikitable&amp;quot;&lt;br /&gt;
|+LLM Training Compute Resource Consumption&lt;br /&gt;
!# of Acc.&lt;br /&gt;
!Acc. Hardware&lt;br /&gt;
!Training Duration&lt;br /&gt;
!Notes&lt;br /&gt;
!VRAM Usage (GB / %)&lt;br /&gt;
|-&lt;br /&gt;
|1x&lt;br /&gt;
|Nvidia L4&lt;br /&gt;
|&lt;br /&gt;
| rowspan=&amp;quot;6&amp;quot; |&#039;&#039;Pending&#039;&#039;&lt;br /&gt;
|&lt;br /&gt;
|-&lt;br /&gt;
|3x&lt;br /&gt;
|Nvidia L4&lt;br /&gt;
|&lt;br /&gt;
|&lt;br /&gt;
|-&lt;br /&gt;
|4x&lt;br /&gt;
|Nvidia L4&lt;br /&gt;
|&lt;br /&gt;
|&lt;br /&gt;
|-&lt;br /&gt;
|1x&lt;br /&gt;
|AMD MI210&lt;br /&gt;
|&lt;br /&gt;
|&lt;br /&gt;
|-&lt;br /&gt;
|3x&lt;br /&gt;
|AMD MI210&lt;br /&gt;
|&lt;br /&gt;
|&lt;br /&gt;
|-&lt;br /&gt;
|1x&lt;br /&gt;
| rowspan=&amp;quot;6&amp;quot; |Nvidia A100&lt;br /&gt;
|&lt;br /&gt;
|&lt;br /&gt;
|-&lt;br /&gt;
| rowspan=&amp;quot;3&amp;quot; |1x&lt;br /&gt;
|&#039;&#039;Pending&#039;&#039;&lt;br /&gt;
|PDBS: 1&lt;br /&gt;
|&lt;br /&gt;
|-&lt;br /&gt;
|2h50m25s&lt;br /&gt;
|PDBS: 5&lt;br /&gt;
|~70.704 / 88.38%&lt;br /&gt;
|-&lt;br /&gt;
|2h32m32s&lt;br /&gt;
|PDBS: 7&lt;br /&gt;
|~76.472 / 95.59%&lt;br /&gt;
|-&lt;br /&gt;
|3x&lt;br /&gt;
|1h14s&lt;br /&gt;
|PDBS: 7&lt;br /&gt;
|~224.384 / 93.49%&lt;br /&gt;
|-&lt;br /&gt;
|4x&lt;br /&gt;
|47m15s&lt;br /&gt;
|PDBS: 7&lt;br /&gt;
|~306.824 / 95.88%&lt;br /&gt;
|-&lt;br /&gt;
|1x&lt;br /&gt;
| rowspan=&amp;quot;3&amp;quot; |Nvidia H100&lt;br /&gt;
|&lt;br /&gt;
| rowspan=&amp;quot;3&amp;quot; |&#039;&#039;Planned&#039;&#039;&lt;br /&gt;
|&lt;br /&gt;
|-&lt;br /&gt;
|3x&lt;br /&gt;
|&lt;br /&gt;
|&lt;br /&gt;
|-&lt;br /&gt;
|4x&lt;br /&gt;
|&lt;br /&gt;
|&lt;br /&gt;
|}&lt;br /&gt;
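PDBS in the notes above is the per-device train batch size (per_device_train_batch_size in the training script below). The effective batch size per optimizer step is PDBS times the number of accelerators (times any gradient accumulation steps); for example, PDBS 7 on 4x A100 yields 28 sequences per step.&lt;br /&gt;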
&lt;br /&gt;
===Training Script (w/HuggingFace)===&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;python3&amp;quot; line=&amp;quot;1&amp;quot;&amp;gt;&lt;br /&gt;
import torch&lt;br /&gt;
&lt;br /&gt;
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments&lt;br /&gt;
from datasets import load_from_disk&lt;br /&gt;
from trl import SFTTrainer, AutoModelForCausalLMWithValueHead, ModelConfig, get_peft_config, get_quantization_config, get_kbit_device_map, DataCollatorForCompletionOnlyLM&lt;br /&gt;
from peft import PeftModel, TaskType, LoraConfig, get_peft_model&lt;br /&gt;
from jinja2 import Template  # used to render the prompt template in prompt_formatting_func&lt;br /&gt;
 &lt;br /&gt;
base_model = &amp;quot;/scratch/$$YOUR_MYID/llm/models/hf/Meta-Llama-3-8B&amp;quot;&lt;br /&gt;
output_dir = &amp;quot;/lscratch/$$YOUR_MYID/guac0/&amp;quot;&lt;br /&gt;
&lt;br /&gt;
report_to = &amp;quot;wandb&amp;quot;&lt;br /&gt;
&lt;br /&gt;
attn_implementation = &amp;quot;flash_attention_2&amp;quot;&lt;br /&gt;
&lt;br /&gt;
def prompt_formatting_func(article):&lt;br /&gt;
    # Render each example&#039;s message list into an Alpaca-style prompt string&lt;br /&gt;
    prompt_template = Template(&amp;quot;Below is an instruction that describes a task. Write a response that appropriately completes the request.\n{% for message in messages %}\n{% if message[&#039;role&#039;] == &#039;system&#039; %}### Instruction:\n{% elif message[&#039;role&#039;] == &#039;user&#039; %}### Input:\n{% elif message[&#039;role&#039;] == &#039;assistant&#039; %}### Response:\n{% endif %}{{message[&#039;content&#039;]}}\n{% endfor %}\n### Response:\n&amp;quot;)&lt;br /&gt;
    output_texts = []&lt;br /&gt;
&lt;br /&gt;
    for i in range(len(article[&#039;hash&#039;])):&lt;br /&gt;
        text = prompt_template.render(messages = article[&#039;messages&#039;][i])&lt;br /&gt;
        output_texts.append(text)&lt;br /&gt;
    return output_texts&lt;br /&gt;
&lt;br /&gt;
if __name__ == &amp;quot;__main__&amp;quot;:&lt;br /&gt;
    # Basic model config&lt;br /&gt;
    model_config = ModelConfig(&lt;br /&gt;
        model_name_or_path      = base_model,&lt;br /&gt;
        attn_implementation     = attn_implementation,&lt;br /&gt;
    )&lt;br /&gt;
    quant_config = get_quantization_config(model_config)&lt;br /&gt;
&lt;br /&gt;
    model_kwargs = dict(&lt;br /&gt;
        torch_dtype         = &amp;quot;auto&amp;quot;,&lt;br /&gt;
        load_in_4bit        = True,&lt;br /&gt;
        trust_remote_code   = False, # Don&#039;t&lt;br /&gt;
        attn_implementation = attn_implementation,&lt;br /&gt;
        use_cache           = False, # false if grad chkpnting&lt;br /&gt;
        quantization_config = get_quantization_config(model_config),&lt;br /&gt;
        device_map          = get_kbit_device_map(),&lt;br /&gt;
    )&lt;br /&gt;
&lt;br /&gt;
    # Load model &amp;amp; tokenizer&lt;br /&gt;
    tokenizer  = AutoTokenizer.from_pretrained(base_model)&lt;br /&gt;
    model      = AutoModelForCausalLM.from_pretrained(base_model, **model_kwargs)&lt;br /&gt;
&lt;br /&gt;
    tokenizer.pad_token = tokenizer.eos_token&lt;br /&gt;
&lt;br /&gt;
    lora_config = LoraConfig(&lt;br /&gt;
        r               = 64,&lt;br /&gt;
        lora_alpha      = 16,&lt;br /&gt;
        lora_dropout    = 0.05,&lt;br /&gt;
        bias            = &amp;quot;none&amp;quot;,&lt;br /&gt;
        task_type       = &amp;quot;CAUSAL_LM&amp;quot;,&lt;br /&gt;
    )&lt;br /&gt;
    model = get_peft_model(model, lora_config)&lt;br /&gt;
&lt;br /&gt;
    train_dataset = load_from_disk(&amp;quot;/scratch/ks98810/llm/datasets/guac-merge0&amp;quot;)&lt;br /&gt;
    training_args = TrainingArguments(&lt;br /&gt;
        logging_strategy            = &amp;quot;steps&amp;quot;,&lt;br /&gt;
        logging_steps               = 500,&lt;br /&gt;
        logging_first_step          = True,&lt;br /&gt;
        report_to                   = report_to,&lt;br /&gt;
        num_train_epochs            = 3,&lt;br /&gt;
        output_dir                  = output_dir,&lt;br /&gt;
        per_device_train_batch_size = 1,&lt;br /&gt;
        learning_rate               = 2e-4,&lt;br /&gt;
    )&lt;br /&gt;
    response_template = &amp;quot;### Response:\n&amp;quot;&lt;br /&gt;
    collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer = tokenizer)&lt;br /&gt;
    trainer = SFTTrainer(&lt;br /&gt;
        model,&lt;br /&gt;
        tokenizer           = tokenizer,&lt;br /&gt;
        args                = training_args,&lt;br /&gt;
        train_dataset       = train_dataset,&lt;br /&gt;
        dataset_text_field  = &amp;quot;text&amp;quot;,&lt;br /&gt;
        max_seq_length      = 4096,&lt;br /&gt;
        peft_config         = lora_config,&lt;br /&gt;
        formatting_func     = prompt_formatting_func,&lt;br /&gt;
        data_collator       = collator,&lt;br /&gt;
    )&lt;br /&gt;
&lt;br /&gt;
    trainer.train()&lt;br /&gt;
    trainer.save_model(output_dir)&lt;br /&gt;
&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
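&lt;br /&gt;
The submission script below passes -b, -m, -o, -e and -s flags to this training script, while the listing above hard-codes the corresponding values. A minimal sketch of the argument parsing the script would need to accept those flags (flag names taken from TRAINING_ARGS below; long option names and defaults are illustrative):&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;python3&amp;quot;&amp;gt;&lt;br /&gt;
import argparse&lt;br /&gt;
&lt;br /&gt;
def parse_args():&lt;br /&gt;
    # Sketch only: maps the flags used in TRAINING_ARGS onto the values hard-coded above&lt;br /&gt;
    parser = argparse.ArgumentParser(description=&amp;quot;LoRA fine-tuning of Meta-Llama-3-8B&amp;quot;)&lt;br /&gt;
    parser.add_argument(&amp;quot;-b&amp;quot;, &amp;quot;--per-device-batch-size&amp;quot;, type=int, default=1)&lt;br /&gt;
    parser.add_argument(&amp;quot;-m&amp;quot;, &amp;quot;--base-model&amp;quot;, default=&amp;quot;/scratch/$$YOUR_MYID/llm/models/hf/Meta-Llama-3-8B&amp;quot;)&lt;br /&gt;
    parser.add_argument(&amp;quot;-o&amp;quot;, &amp;quot;--output-dir&amp;quot;, default=&amp;quot;/lscratch/$$YOUR_MYID/guac0/&amp;quot;)&lt;br /&gt;
    parser.add_argument(&amp;quot;-e&amp;quot;, &amp;quot;--epochs&amp;quot;, type=int, default=3)&lt;br /&gt;
    parser.add_argument(&amp;quot;-s&amp;quot;, &amp;quot;--max-seq-length&amp;quot;, type=int, default=4096)&lt;br /&gt;
    return parser.parse_args()&lt;br /&gt;
&lt;br /&gt;
# e.g. args = parse_args(); then use args.base_model, args.output_dir, etc. in place of the constants&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;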
&lt;br /&gt;
===Job Submission Script===&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;slurm&amp;quot; line=&amp;quot;1&amp;quot;&amp;gt;&lt;br /&gt;
#!/usr/bin/env bash&lt;br /&gt;
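# Submit from a login node with: sbatch train_guac0_1xA100.sh   (file name is just an example;&lt;br /&gt;
# fill in the $$YOUR_MYID and $$YOUR_LAB placeholders throughout before submitting)&lt;br /&gt;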
#SBATCH --job-name=train_guac0_1xA100&lt;br /&gt;
#SBATCH --cpus-per-task=16&lt;br /&gt;
#SBATCH --partition=gpu_p&lt;br /&gt;
#SBATCH --gres=gpu:A100:1&lt;br /&gt;
#SBATCH --ntasks=1&lt;br /&gt;
#SBATCH --mem=64gb&lt;br /&gt;
#SBATCH --time=03:00:00&lt;br /&gt;
#SBATCH --output=logs/%x.%j.out&lt;br /&gt;
#SBATCH --error=logs/%x.%j.err&lt;br /&gt;
&lt;br /&gt;
#SBATCH --mail-type=ALL&lt;br /&gt;
#SBATCH --mail-user=$$YOUR_MYID@uga.edu&lt;br /&gt;
&lt;br /&gt;
export JOB_CUSTODIAN=&amp;quot;$$YOUR_MYID&amp;quot;&lt;br /&gt;
export JOB_GROUP=&amp;quot;$$YOUR_LAB&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export PROJECT_DIR=&amp;quot;/work/$JOB_GROUP/$JOB_CUSTODIAN/&amp;quot;&lt;br /&gt;
export SCRATCH_DIR=&amp;quot;/scratch/$JOB_CUSTODIAN/&amp;quot;&lt;br /&gt;
export LSCRATCH_DIR=&amp;quot;/lscratch/$JOB_CUSTODIAN/&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export PROJECT_NAME=&amp;quot;guac0&amp;quot;&lt;br /&gt;
export PROJECT_VARIANT=&amp;quot;flash-attn0&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export PROJECT_TITLE=&amp;quot;$PROJECT_NAME.$PROJECT_VARIANT.$SLURM_JOBID&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export RESULT_DEPOT=&amp;quot;$SCRATCH_DIR/llm/models/hf/$PROJECT_TITLE&amp;quot;&lt;br /&gt;
export TRAINING_OUTPUT=&amp;quot;$LSCRATCH_DIR/$PROJECT_TITLE&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export TRAINING_BASE_MODEL=&amp;quot;Meta-Llama-3-8B&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export OMP_NUM_THREADS=16&lt;br /&gt;
export PER_DEVICE_BATCH_SIZE=1&lt;br /&gt;
export GPUS_PER_NODE=1&lt;br /&gt;
export TRAINING_EPOCHS=3&lt;br /&gt;
export MAX_SEQ_LENGTH=4096&lt;br /&gt;
&lt;br /&gt;
export TRAINING_VENV=&amp;quot;/scratch/$$YOUR_MYID/llm/projects/workbench/venv/&amp;quot;&lt;br /&gt;
export TRAINING_SCRIPT=&amp;quot;/scratch/$$YOUR_MYID/llm/projects/guac/scripts/training/train_guac0.py&amp;quot;&lt;br /&gt;
export TRAINING_ARGS=&amp;quot;-b $PER_DEVICE_BATCH_SIZE -m $TRAINING_BASE_MODEL -o $TRAINING_OUTPUT -e $TRAINING_EPOCHS -s $MAX_SEQ_LENGTH&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export WANDB_PROJECT=&amp;quot;$PROJECT_NAME&amp;quot;&lt;br /&gt;
export WANDB_LOG_MODEL=&amp;quot;checkpoint&amp;quot;&lt;br /&gt;
export WANDB_JOB_TYPE=&amp;quot;training&amp;quot;&lt;br /&gt;
export WANDB_NAME=&amp;quot;$PROJECT_TITLE&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export CUDA_VERSION=&amp;quot;12.1.1&amp;quot;&lt;br /&gt;
export RDZV_BACKEND=&amp;quot;c10d&amp;quot;&lt;br /&gt;
export RDZV_ID=2299&lt;br /&gt;
export RDZV_PORT=29500&lt;br /&gt;
&lt;br /&gt;
cd $SLURM_SUBMIT_DIR&lt;br /&gt;
&lt;br /&gt;
module load CUDA/$CUDA_VERSION diffusers ccache wandb flash-attn&lt;br /&gt;
&lt;br /&gt;
head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)&lt;br /&gt;
&lt;br /&gt;
export LAUNCHER=&amp;quot;python -m torch.distributed.run \&lt;br /&gt;
        --nproc_per_node $GPUS_PER_NODE \&lt;br /&gt;
        --nnodes $SLURM_NNODES \&lt;br /&gt;
        --rdzv_id $RDZV_ID \&lt;br /&gt;
        --rdzv_backend $RDZV_BACKEND \&lt;br /&gt;
        --rdzv_endpoint $head_node_ip:$RDZV_PORT \&lt;br /&gt;
&amp;quot;&lt;br /&gt;
&lt;br /&gt;
source $TRAINING_VENV/bin/activate&lt;br /&gt;
&lt;br /&gt;
export CMD=&amp;quot;$LAUNCHER $TRAINING_SCRIPT $TRAINING_ARGS&amp;quot;&lt;br /&gt;
srun --jobid $SLURM_JOB_ID bash -c &amp;quot;$CMD&amp;quot;&lt;br /&gt;
&lt;br /&gt;
deactivate&lt;br /&gt;
rsync -r $TRAINING_OUTPUT $RESULT_DEPOT&lt;br /&gt;
&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;/div&gt;</summary>
		<author><name>Kstanier</name></author>
	</entry>
	<entry>
		<id>https://wiki.gacrc.uga.edu/index.php?title=LLM_Training&amp;diff=21961</id>
		<title>LLM Training</title>
		<link rel="alternate" type="text/html" href="https://wiki.gacrc.uga.edu/index.php?title=LLM_Training&amp;diff=21961"/>
		<updated>2024-07-04T14:50:05Z</updated>

		<summary type="html">&lt;p&gt;Kstanier: /* Compute Resources */&lt;/p&gt;
&lt;hr /&gt;
&lt;div&gt;===Introduction===&lt;br /&gt;
&lt;br /&gt;
===HuggingFace===&lt;br /&gt;
&lt;br /&gt;
==== Hub ====&lt;br /&gt;
&lt;br /&gt;
==== Libraries ====&lt;br /&gt;
&lt;br /&gt;
===== Transformers =====&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
module load Transformers&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
&lt;br /&gt;
===== Datasets =====&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
module load datasets&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
&lt;br /&gt;
===== TRL =====&lt;br /&gt;
(a software module is coming soon; in the meantime, TRL can be installed in a virtual environment)&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
python3 -m venv ~/trl_venv&lt;br /&gt;
source ~/trl_venv/bin/activate&lt;br /&gt;
pip install --require-virtualenv trl&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
&lt;br /&gt;
=== Compute Resources ===&lt;br /&gt;
The benchmarks below fine-tune Meta-Llama-3-8B, loaded in 4-bit with the PEFT library, on 52,002 instruction articles for 3 epochs.&lt;br /&gt;
{| class=&amp;quot;wikitable&amp;quot;&lt;br /&gt;
|+Tested Accelerators&lt;br /&gt;
!Vendor&lt;br /&gt;
!Product&lt;br /&gt;
!Backend&lt;br /&gt;
!VRAM (GB)&lt;br /&gt;
|-&lt;br /&gt;
| rowspan=&amp;quot;3&amp;quot; |Nvidia&lt;br /&gt;
|L4&lt;br /&gt;
| rowspan=&amp;quot;3&amp;quot; |CUDA&lt;br /&gt;
|24&lt;br /&gt;
|-&lt;br /&gt;
|A100&lt;br /&gt;
| rowspan=&amp;quot;2&amp;quot; |80&lt;br /&gt;
|-&lt;br /&gt;
|H100&lt;br /&gt;
|-&lt;br /&gt;
|AMD&lt;br /&gt;
|MI210&lt;br /&gt;
|ROCm&lt;br /&gt;
|64&lt;br /&gt;
|}&lt;br /&gt;
{| class=&amp;quot;wikitable&amp;quot;&lt;br /&gt;
|+LLM Training Compute Resource Consumption&lt;br /&gt;
!# of Acc.&lt;br /&gt;
!Acc. Hardware&lt;br /&gt;
!Training Duration&lt;br /&gt;
!Notes&lt;br /&gt;
!VRAM Usage (GB / %)&lt;br /&gt;
|-&lt;br /&gt;
|1x&lt;br /&gt;
|Nvidia L4&lt;br /&gt;
|&lt;br /&gt;
| rowspan=&amp;quot;6&amp;quot; |&#039;&#039;Pending&#039;&#039;&lt;br /&gt;
|&lt;br /&gt;
|-&lt;br /&gt;
|3x&lt;br /&gt;
|Nvidia L4&lt;br /&gt;
|&lt;br /&gt;
|&lt;br /&gt;
|-&lt;br /&gt;
|4x&lt;br /&gt;
|Nvidia L4&lt;br /&gt;
|&lt;br /&gt;
|&lt;br /&gt;
|-&lt;br /&gt;
|1x&lt;br /&gt;
|AMD MI210&lt;br /&gt;
|&lt;br /&gt;
|&lt;br /&gt;
|-&lt;br /&gt;
|3x&lt;br /&gt;
|AMD MI210&lt;br /&gt;
|&lt;br /&gt;
|&lt;br /&gt;
|-&lt;br /&gt;
|1x&lt;br /&gt;
| rowspan=&amp;quot;6&amp;quot; |Nvidia A100&lt;br /&gt;
|&lt;br /&gt;
|&lt;br /&gt;
|-&lt;br /&gt;
| rowspan=&amp;quot;3&amp;quot; |1x&lt;br /&gt;
|&#039;&#039;Pending&#039;&#039;&lt;br /&gt;
|PDBS: 1&lt;br /&gt;
|&lt;br /&gt;
|-&lt;br /&gt;
|2h50m25s&lt;br /&gt;
|PDBS: 5&lt;br /&gt;
|~70.704 / 88.38%&lt;br /&gt;
|-&lt;br /&gt;
|2h32m32s&lt;br /&gt;
|PDBS: 7&lt;br /&gt;
|~76.472 / 95.59%&lt;br /&gt;
|-&lt;br /&gt;
|3x&lt;br /&gt;
|1h14s&lt;br /&gt;
|PDBS: 7&lt;br /&gt;
|~224.384 / 93.49%&lt;br /&gt;
|-&lt;br /&gt;
|4x&lt;br /&gt;
|47m15s&lt;br /&gt;
|PDBS: 7&lt;br /&gt;
|~306.824 / 95.88%&lt;br /&gt;
|-&lt;br /&gt;
|1x&lt;br /&gt;
| rowspan=&amp;quot;3&amp;quot; |Nvidia H100&lt;br /&gt;
|&lt;br /&gt;
| rowspan=&amp;quot;3&amp;quot; |&#039;&#039;Planned&#039;&#039;&lt;br /&gt;
|&lt;br /&gt;
|-&lt;br /&gt;
|3x&lt;br /&gt;
|&lt;br /&gt;
|&lt;br /&gt;
|-&lt;br /&gt;
|4x&lt;br /&gt;
|&lt;br /&gt;
|&lt;br /&gt;
|}&lt;br /&gt;
&lt;br /&gt;
===Training Script (w/HuggingFace)===&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;python3&amp;quot; line=&amp;quot;1&amp;quot;&amp;gt;&lt;br /&gt;
import torch&lt;br /&gt;
&lt;br /&gt;
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments&lt;br /&gt;
from datasets import load_from_disk&lt;br /&gt;
from trl import SFTTrainer, AutoModelForCausalLMWithValueHead, ModelConfig, get_peft_config, get_quantization_config, get_kbit_device_map, DataCollatorForCompletionOnlyLM&lt;br /&gt;
from peft import PeftModel, TaskType, LoraConfig, get_peft_model&lt;br /&gt;
from jinja2 import Template  # used to render the prompt template in prompt_formatting_func&lt;br /&gt;
 &lt;br /&gt;
base_model = &amp;quot;/scratch/$$YOUR_MYID/llm/models/hf/Meta-Llama-3-8B&amp;quot;&lt;br /&gt;
output_dir = &amp;quot;/lscratch/$$YOUR_MYID/guac0/&amp;quot;&lt;br /&gt;
&lt;br /&gt;
report_to = &amp;quot;wandb&amp;quot;&lt;br /&gt;
&lt;br /&gt;
attn_implementation = &amp;quot;flash_attention_2&amp;quot;&lt;br /&gt;
&lt;br /&gt;
def prompt_formatting_func(article):&lt;br /&gt;
    # Render each example&#039;s message list into an Alpaca-style prompt string&lt;br /&gt;
    prompt_template = Template(&amp;quot;Below is an instruction that describes a task. Write a response that appropriately completes the request.\n{% for message in messages %}\n{% if message[&#039;role&#039;] == &#039;system&#039; %}### Instruction:\n{% elif message[&#039;role&#039;] == &#039;user&#039; %}### Input:\n{% elif message[&#039;role&#039;] == &#039;assistant&#039; %}### Response:\n{% endif %}{{message[&#039;content&#039;]}}\n{% endfor %}\n### Response:\n&amp;quot;)&lt;br /&gt;
    output_texts = []&lt;br /&gt;
&lt;br /&gt;
    for i in range(len(article[&#039;hash&#039;])):&lt;br /&gt;
        text = prompt_template.render(messages = article[&#039;messages&#039;][i])&lt;br /&gt;
        output_texts.append(text)&lt;br /&gt;
    return output_texts&lt;br /&gt;
&lt;br /&gt;
if __name__ == &amp;quot;__main__&amp;quot;:&lt;br /&gt;
    # Basic model config&lt;br /&gt;
    model_config = ModelConfig(&lt;br /&gt;
        model_name_or_path      = base_model,&lt;br /&gt;
        attn_implementation     = attn_implementation,&lt;br /&gt;
    )&lt;br /&gt;
    quant_config = get_quantization_config(model_config)&lt;br /&gt;
&lt;br /&gt;
    model_kwargs = dict(&lt;br /&gt;
        torch_dtype         = &amp;quot;auto&amp;quot;,&lt;br /&gt;
        load_in_4bit        = True,&lt;br /&gt;
        trust_remote_code   = False, # Don&#039;t&lt;br /&gt;
        attn_implementation = attn_implementation,&lt;br /&gt;
        use_cache           = False, # false if grad chkpnting&lt;br /&gt;
        quantization_config = get_quantization_config(model_config),&lt;br /&gt;
        device_map          = get_kbit_device_map(),&lt;br /&gt;
    )&lt;br /&gt;
&lt;br /&gt;
    # Load model &amp;amp; tokenizer&lt;br /&gt;
    tokenizer  = AutoTokenizer.from_pretrained(base_model)&lt;br /&gt;
    model      = AutoModelForCausalLM.from_pretrained(base_model, **model_kwargs)&lt;br /&gt;
&lt;br /&gt;
    tokenizer.pad_token = tokenizer.eos_token&lt;br /&gt;
&lt;br /&gt;
    lora_config = LoraConfig(&lt;br /&gt;
        r               = 64,&lt;br /&gt;
        lora_alpha      = 16,&lt;br /&gt;
        lora_dropout    = 0.05,&lt;br /&gt;
        bias            = &amp;quot;none&amp;quot;,&lt;br /&gt;
        task_type       = &amp;quot;CAUSAL_LM&amp;quot;,&lt;br /&gt;
    )&lt;br /&gt;
    model = get_peft_model(model, lora_config)&lt;br /&gt;
&lt;br /&gt;
    train_dataset = load_from_disk(&amp;quot;/scratch/ks98810/llm/datasets/guac-merge0&amp;quot;)&lt;br /&gt;
    training_args = TrainingArguments(&lt;br /&gt;
        logging_strategy            = &amp;quot;steps&amp;quot;,&lt;br /&gt;
        logging_steps               = 500,&lt;br /&gt;
        logging_first_step          = True,&lt;br /&gt;
        report_to                   = report_to,&lt;br /&gt;
        num_train_epochs            = 3,&lt;br /&gt;
        output_dir                  = output_dir,&lt;br /&gt;
        per_device_train_batch_size = 1,&lt;br /&gt;
        learning_rate               = 2e-4,&lt;br /&gt;
    )&lt;br /&gt;
    response_template = &amp;quot;### Response:\n&amp;quot;&lt;br /&gt;
    collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer = tokenizer)&lt;br /&gt;
    trainer = SFTTrainer(&lt;br /&gt;
        model,&lt;br /&gt;
        tokenizer           = tokenizer,&lt;br /&gt;
        args                = training_args,&lt;br /&gt;
        train_dataset       = train_dataset,&lt;br /&gt;
        dataset_text_field  = &amp;quot;text&amp;quot;,&lt;br /&gt;
        max_seq_length      = 4096,&lt;br /&gt;
        peft_config         = lora_config,&lt;br /&gt;
        formatting_func     = prompt_formatting_func,&lt;br /&gt;
        data_collator       = collator,&lt;br /&gt;
    )&lt;br /&gt;
&lt;br /&gt;
    trainer.train()&lt;br /&gt;
    trainer.save_model(output_dir)&lt;br /&gt;
&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
&lt;br /&gt;
===Job Submission Script===&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;slurm&amp;quot; line=&amp;quot;1&amp;quot;&amp;gt;&lt;br /&gt;
#!/usr/bin/env bash&lt;br /&gt;
#SBATCH --job-name=train_guac0_1xA100&lt;br /&gt;
#SBATCH --cpus-per-task=16&lt;br /&gt;
#SBATCH --partition=gpu_p&lt;br /&gt;
#SBATCH --gres=gpu:A100:1&lt;br /&gt;
#SBATCH --ntasks=1&lt;br /&gt;
#SBATCH --mem=64gb&lt;br /&gt;
#SBATCH --time=03:00:00&lt;br /&gt;
#SBATCH --output=logs/%x.%j.out&lt;br /&gt;
#SBATCH --error=logs/%x.%j.err&lt;br /&gt;
&lt;br /&gt;
#SBATCH --mail-type=ALL&lt;br /&gt;
#SBATCH --mail-user=$$YOUR_MYID@uga.edu&lt;br /&gt;
&lt;br /&gt;
export JOB_CUSTODIAN=&amp;quot;$$YOUR_MYID&amp;quot;&lt;br /&gt;
export JOB_GROUP=&amp;quot;$$YOUR_LAB&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export PROJECT_DIR=&amp;quot;/work/$JOB_GROUP/$JOB_CUSTODIAN/&amp;quot;&lt;br /&gt;
export SCRATCH_DIR=&amp;quot;/scratch/$JOB_CUSTODIAN/&amp;quot;&lt;br /&gt;
export LSCRATCH_DIR=&amp;quot;/lscratch/$JOB_CUSTODIAN/&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export PROJECT_NAME=&amp;quot;guac0&amp;quot;&lt;br /&gt;
export PROJECT_VARIANT=&amp;quot;flash-attn0&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export PROJECT_TITLE=&amp;quot;$PROJECT_NAME.$PROJECT_VARIANT.$SLURM_JOBID&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export RESULT_DEPOT=&amp;quot;$SCRATCH_DIR/llm/models/hf/$PROJECT_TITLE&amp;quot;&lt;br /&gt;
export TRAINING_OUTPUT=&amp;quot;$LSCRATCH_DIR/$PROJECT_TITLE&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export TRAINING_BASE_MODEL=&amp;quot;Meta-Llama-3-8B&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export OMP_NUM_THREADS=16&lt;br /&gt;
export PER_DEVICE_BATCH_SIZE=1&lt;br /&gt;
export GPUS_PER_NODE=1&lt;br /&gt;
export TRAINING_EPOCHS=3&lt;br /&gt;
export MAX_SEQ_LENGTH=4096&lt;br /&gt;
&lt;br /&gt;
export TRAINING_VENV=&amp;quot;/scratch/$$YOUR_MYID/llm/projects/workbench/venv/&amp;quot;&lt;br /&gt;
export TRAINING_SCRIPT=&amp;quot;/scratch/$$YOUR_MYID/llm/projects/guac/scripts/training/train_guac0.py&amp;quot;&lt;br /&gt;
export TRAINING_ARGS=&amp;quot;-b $PER_DEVICE_BATCH_SIZE -m $TRAINING_BASE_MODEL -o $TRAINING_OUTPUT -e $TRAINING_EPOCHS -s $MAX_SEQ_LENGTH&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export WANDB_PROJECT=&amp;quot;$PROJECT_NAME&amp;quot;&lt;br /&gt;
export WANDB_LOG_MODEL=&amp;quot;checkpoint&amp;quot;&lt;br /&gt;
export WANDB_JOB_TYPE=&amp;quot;training&amp;quot;&lt;br /&gt;
export WANDB_NAME=&amp;quot;$PROJECT_TITLE&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export CUDA_VERSION=&amp;quot;12.1.1&amp;quot;&lt;br /&gt;
export RDZV_BACKEND=&amp;quot;c10d&amp;quot;&lt;br /&gt;
export RDZV_ID=2299&lt;br /&gt;
export RDZV_PORT=29500&lt;br /&gt;
&lt;br /&gt;
cd $SLURM_SUBMIT_DIR&lt;br /&gt;
&lt;br /&gt;
module load CUDA/$CUDA_VERSION diffusers ccache wandb flash-attn&lt;br /&gt;
&lt;br /&gt;
head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)&lt;br /&gt;
&lt;br /&gt;
export LAUNCHER=&amp;quot;python -m torch.distributed.run \&lt;br /&gt;
        --nproc_per_node $GPUS_PER_NODE \&lt;br /&gt;
        --nnodes $SLURM_NNODES \&lt;br /&gt;
        --rdzv_id $RDZV_ID \&lt;br /&gt;
        --rdzv_backend $RDZV_BACKEND \&lt;br /&gt;
        --rdzv_endpoint $head_node_ip:$RDZV_PORT \&lt;br /&gt;
&amp;quot;&lt;br /&gt;
&lt;br /&gt;
source $TRAINING_VENV/bin/activate&lt;br /&gt;
&lt;br /&gt;
export CMD=&amp;quot;$LAUNCHER $TRAINING_SCRIPT $TRAINING_ARGS&amp;quot;&lt;br /&gt;
srun --jobid $SLURM_JOB_ID bash -c &amp;quot;$CMD&amp;quot;&lt;br /&gt;
&lt;br /&gt;
deactivate&lt;br /&gt;
rsync -r $TRAINING_OUTPUT $RESULT_DEPOT&lt;br /&gt;
&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;/div&gt;</summary>
		<author><name>Kstanier</name></author>
	</entry>
	<entry>
		<id>https://wiki.gacrc.uga.edu/index.php?title=LLM_Training&amp;diff=21960</id>
		<title>LLM Training</title>
		<link rel="alternate" type="text/html" href="https://wiki.gacrc.uga.edu/index.php?title=LLM_Training&amp;diff=21960"/>
		<updated>2024-07-04T03:01:39Z</updated>

		<summary type="html">&lt;p&gt;Kstanier: /* Compute Resources */&lt;/p&gt;
&lt;hr /&gt;
&lt;div&gt;===Introduction===&lt;br /&gt;
&lt;br /&gt;
===HuggingFace===&lt;br /&gt;
&lt;br /&gt;
==== Hub ====&lt;br /&gt;
&lt;br /&gt;
==== Libraries ====&lt;br /&gt;
&lt;br /&gt;
===== Transformers =====&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
module load Transformers&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
&lt;br /&gt;
===== Datasets =====&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
module load datasets&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
&lt;br /&gt;
===== TRL =====&lt;br /&gt;
(a software module is coming soon; in the meantime, TRL can be installed in a virtual environment)&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
python3 -m venv ~/trl_venv&lt;br /&gt;
source ~/trl_venv/bin/activate&lt;br /&gt;
pip install --require-virtualenv trl&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
&lt;br /&gt;
=== Compute Resources ===&lt;br /&gt;
The benchmarks below fine-tune Meta-Llama-3-8B, loaded in 4-bit with the PEFT library, on 52,002 instruction articles for 3 epochs.&lt;br /&gt;
{| class=&amp;quot;wikitable&amp;quot;&lt;br /&gt;
|+Tested Accelerators&lt;br /&gt;
!Vendor&lt;br /&gt;
!Product&lt;br /&gt;
!Backend&lt;br /&gt;
!VRAM (GB)&lt;br /&gt;
|-&lt;br /&gt;
| rowspan=&amp;quot;3&amp;quot; |Nvidia&lt;br /&gt;
|L4&lt;br /&gt;
| rowspan=&amp;quot;3&amp;quot; |CUDA&lt;br /&gt;
|24&lt;br /&gt;
|-&lt;br /&gt;
|A100&lt;br /&gt;
| rowspan=&amp;quot;2&amp;quot; |80&lt;br /&gt;
|-&lt;br /&gt;
|H100&lt;br /&gt;
|-&lt;br /&gt;
|AMD&lt;br /&gt;
|MI210&lt;br /&gt;
|ROCm&lt;br /&gt;
|64&lt;br /&gt;
|}&lt;br /&gt;
{| class=&amp;quot;wikitable&amp;quot;&lt;br /&gt;
|+LLM Training Compute Resource Consumption&lt;br /&gt;
!# of Acc.&lt;br /&gt;
!Acc. Hardware&lt;br /&gt;
!Training Duration&lt;br /&gt;
!Notes&lt;br /&gt;
|-&lt;br /&gt;
|1x&lt;br /&gt;
|Nvidia L4&lt;br /&gt;
|&lt;br /&gt;
| rowspan=&amp;quot;6&amp;quot; |Pending&lt;br /&gt;
|-&lt;br /&gt;
|3x&lt;br /&gt;
|Nvidia L4&lt;br /&gt;
|&lt;br /&gt;
|-&lt;br /&gt;
|4x&lt;br /&gt;
|Nvidia L4&lt;br /&gt;
|&lt;br /&gt;
|-&lt;br /&gt;
|1x&lt;br /&gt;
|AMD MI210&lt;br /&gt;
|&lt;br /&gt;
|-&lt;br /&gt;
|3x&lt;br /&gt;
|AMD MI210&lt;br /&gt;
|&lt;br /&gt;
|-&lt;br /&gt;
|1x&lt;br /&gt;
| rowspan=&amp;quot;6&amp;quot; |Nvidia A100&lt;br /&gt;
|&lt;br /&gt;
|-&lt;br /&gt;
| rowspan=&amp;quot;3&amp;quot; |1x&lt;br /&gt;
|&lt;br /&gt;
|PDBS: 1&lt;br /&gt;
|-&lt;br /&gt;
|2h50m25s&lt;br /&gt;
|PDBS: 5&lt;br /&gt;
|-&lt;br /&gt;
|2h32m32s&lt;br /&gt;
|PDBS: 7&lt;br /&gt;
|-&lt;br /&gt;
|3x&lt;br /&gt;
|&lt;br /&gt;
|PDBS: 7&lt;br /&gt;
|-&lt;br /&gt;
|4x&lt;br /&gt;
|47m15s&lt;br /&gt;
|PDBS: 7&lt;br /&gt;
|-&lt;br /&gt;
|1x&lt;br /&gt;
| rowspan=&amp;quot;3&amp;quot; |Nvidia H100&lt;br /&gt;
|&lt;br /&gt;
| rowspan=&amp;quot;3&amp;quot; |Planned&lt;br /&gt;
|-&lt;br /&gt;
|3x&lt;br /&gt;
|&lt;br /&gt;
|-&lt;br /&gt;
|4x&lt;br /&gt;
|&lt;br /&gt;
|}&lt;br /&gt;
&lt;br /&gt;
===Training Script (w/HuggingFace)===&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;python3&amp;quot; line=&amp;quot;1&amp;quot;&amp;gt;&lt;br /&gt;
import torch&lt;br /&gt;
&lt;br /&gt;
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments&lt;br /&gt;
from datasets import load_from_disk&lt;br /&gt;
from trl import SFTTrainer, AutoModelForCausalLMWithValueHead, ModelConfig, get_peft_config, get_quantization_config, get_kbit_device_map, DataCollatorForCompletionOnlyLM&lt;br /&gt;
from peft import PeftModel, TaskType, LoraConfig, get_peft_model&lt;br /&gt;
from jinja2 import Template  # used to render the prompt template in prompt_formatting_func&lt;br /&gt;
 &lt;br /&gt;
base_model = &amp;quot;/scratch/$$YOUR_MYID/llm/models/hf/Meta-Llama-3-8B&amp;quot;&lt;br /&gt;
output_dir = &amp;quot;/lscratch/$$YOUR_MYID/guac0/&amp;quot;&lt;br /&gt;
&lt;br /&gt;
report_to = &amp;quot;wandb&amp;quot;&lt;br /&gt;
&lt;br /&gt;
attn_implementation = &amp;quot;flash_attention_2&amp;quot;&lt;br /&gt;
&lt;br /&gt;
def prompt_formatting_func(article):&lt;br /&gt;
    # Render each example&#039;s message list into an Alpaca-style prompt string&lt;br /&gt;
    prompt_template = Template(&amp;quot;Below is an instruction that describes a task. Write a response that appropriately completes the request.\n{% for message in messages %}\n{% if message[&#039;role&#039;] == &#039;system&#039; %}### Instruction:\n{% elif message[&#039;role&#039;] == &#039;user&#039; %}### Input:\n{% elif message[&#039;role&#039;] == &#039;assistant&#039; %}### Response:\n{% endif %}{{message[&#039;content&#039;]}}\n{% endfor %}\n### Response:\n&amp;quot;)&lt;br /&gt;
    output_texts = []&lt;br /&gt;
&lt;br /&gt;
    for i in range(len(article[&#039;hash&#039;])):&lt;br /&gt;
        text = prompt_template.render(messages = article[&#039;messages&#039;][i])&lt;br /&gt;
        output_texts.append(text)&lt;br /&gt;
    return output_texts&lt;br /&gt;
&lt;br /&gt;
if __name__ == &amp;quot;__main__&amp;quot;:&lt;br /&gt;
    # Basic model config&lt;br /&gt;
    model_config = ModelConfig(&lt;br /&gt;
        model_name_or_path      = base_model,&lt;br /&gt;
        attn_implementation     = attn_implementation,&lt;br /&gt;
    )&lt;br /&gt;
    quant_config = get_quantization_config(model_config)&lt;br /&gt;
&lt;br /&gt;
    model_kwargs = dict(&lt;br /&gt;
        torch_dtype         = &amp;quot;auto&amp;quot;,&lt;br /&gt;
        load_in_4bit        = True,&lt;br /&gt;
        trust_remote_code   = False, # Don&#039;t&lt;br /&gt;
        attn_implementation = attn_implementation,&lt;br /&gt;
        use_cache           = False, # false if grad chkpnting&lt;br /&gt;
        quantization_config = get_quantization_config(model_config),&lt;br /&gt;
        device_map          = get_kbit_device_map(),&lt;br /&gt;
    )&lt;br /&gt;
&lt;br /&gt;
    # Load model &amp;amp; tokenizer&lt;br /&gt;
    tokenizer  = AutoTokenizer.from_pretrained(base_model)&lt;br /&gt;
    model      = AutoModelForCausalLM.from_pretrained(base_model, **model_kwargs)&lt;br /&gt;
&lt;br /&gt;
    tokenizer.pad_token = tokenizer.eos_token&lt;br /&gt;
&lt;br /&gt;
    lora_config = LoraConfig(&lt;br /&gt;
        r               = 64,&lt;br /&gt;
        lora_alpha      = 16,&lt;br /&gt;
        lora_dropout    = 0.05,&lt;br /&gt;
        bias            = &amp;quot;none&amp;quot;,&lt;br /&gt;
        task_type       = &amp;quot;CAUSAL_LM&amp;quot;,&lt;br /&gt;
    )&lt;br /&gt;
    model = get_peft_model(model, lora_config)&lt;br /&gt;
&lt;br /&gt;
    train_dataset = load_from_disk(&amp;quot;/scratch/ks98810/llm/datasets/guac-merge0&amp;quot;)&lt;br /&gt;
    training_args = TrainingArguments(&lt;br /&gt;
        logging_strategy            = &amp;quot;steps&amp;quot;,&lt;br /&gt;
        logging_steps               = 500,&lt;br /&gt;
        logging_first_step          = True,&lt;br /&gt;
        report_to                   = report_to,&lt;br /&gt;
        num_train_epochs            = 3,&lt;br /&gt;
        output_dir                  = output_dir,&lt;br /&gt;
        per_device_train_batch_size = 1,&lt;br /&gt;
        learning_rate               = 2e-4,&lt;br /&gt;
    )&lt;br /&gt;
    response_template = &amp;quot;### Response:\n&amp;quot;&lt;br /&gt;
    collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer = tokenizer)&lt;br /&gt;
    trainer = SFTTrainer(&lt;br /&gt;
        model,&lt;br /&gt;
        tokenizer           = tokenizer,&lt;br /&gt;
        args                = training_args,&lt;br /&gt;
        train_dataset       = train_dataset,&lt;br /&gt;
        dataset_text_field  = &amp;quot;text&amp;quot;,&lt;br /&gt;
        max_seq_length      = 4096,&lt;br /&gt;
        peft_config         = lora_config,&lt;br /&gt;
        formatting_func     = prompt_formatting_func,&lt;br /&gt;
        data_collator       = collator,&lt;br /&gt;
    )&lt;br /&gt;
&lt;br /&gt;
    trainer.train()&lt;br /&gt;
    trainer.save_model(output_dir)&lt;br /&gt;
&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
&lt;br /&gt;
===Job Submission Script===&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;slurm&amp;quot; line=&amp;quot;1&amp;quot;&amp;gt;&lt;br /&gt;
#!/usr/bin/env bash&lt;br /&gt;
#SBATCH --job-name=train_guac0_1xA100&lt;br /&gt;
#SBATCH --cpus-per-task=16&lt;br /&gt;
#SBATCH --partition=gpu_p&lt;br /&gt;
#SBATCH --gres=gpu:A100:1&lt;br /&gt;
#SBATCH --ntasks=1&lt;br /&gt;
#SBATCH --mem=64gb&lt;br /&gt;
#SBATCH --time=03:00:00&lt;br /&gt;
#SBATCH --output=logs/%x.%j.out&lt;br /&gt;
#SBATCH --error=logs/%x.%j.err&lt;br /&gt;
&lt;br /&gt;
#SBATCH --mail-type=ALL&lt;br /&gt;
#SBATCH --mail-user=$$YOUR_MYID@uga.edu&lt;br /&gt;
&lt;br /&gt;
export JOB_CUSTODIAN=&amp;quot;$$YOUR_MYID&amp;quot;&lt;br /&gt;
export JOB_GROUP=&amp;quot;$$YOUR_LAB&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export PROJECT_DIR=&amp;quot;/work/$JOB_GROUP/$JOB_CUSTODIAN/&amp;quot;&lt;br /&gt;
export SCRATCH_DIR=&amp;quot;/scratch/$JOB_CUSTODIAN/&amp;quot;&lt;br /&gt;
export LSCRATCH_DIR=&amp;quot;/lscratch/$JOB_CUSTODIAN/&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export PROJECT_NAME=&amp;quot;guac0&amp;quot;&lt;br /&gt;
export PROJECT_VARIANT=&amp;quot;flash-attn0&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export PROJECT_TITLE=&amp;quot;$PROJECT_NAME.$PROJECT_VARIANT.$SLURM_JOBID&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export RESULT_DEPOT=&amp;quot;$SCRATCH_DIR/llm/models/hf/$PROJECT_TITLE&amp;quot;&lt;br /&gt;
export TRAINING_OUTPUT=&amp;quot;$LSCRATCH_DIR/$PROJECT_TITLE&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export TRAINING_BASE_MODEL=&amp;quot;Meta-Llama-3-8B&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export OMP_NUM_THREADS=16&lt;br /&gt;
export PER_DEVICE_BATCH_SIZE=1&lt;br /&gt;
export GPUS_PER_NODE=1&lt;br /&gt;
export TRAINING_EPOCHS=3&lt;br /&gt;
export MAX_SEQ_LENGTH=4096&lt;br /&gt;
&lt;br /&gt;
export TRAINING_VENV=&amp;quot;/scratch/$$YOUR_MYID/llm/projects/workbench/venv/&amp;quot;&lt;br /&gt;
export TRAINING_SCRIPT=&amp;quot;/scratch/$$YOUR_MYID/llm/projects/guac/scripts/training/train_guac0.py&amp;quot;&lt;br /&gt;
export TRAINING_ARGS=&amp;quot;-b $PER_DEVICE_BATCH_SIZE -m $TRAINING_BASE_MODEL -o $TRAINING_OUTPUT -e $TRAINING_EPOCHS -s $MAX_SEQ_LENGTH&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export WANDB_PROJECT=&amp;quot;$PROJECT_NAME&amp;quot;&lt;br /&gt;
export WANDB_LOG_MODEL=&amp;quot;checkpoint&amp;quot;&lt;br /&gt;
export WANDB_JOB_TYPE=&amp;quot;training&amp;quot;&lt;br /&gt;
export WANDB_NAME=&amp;quot;$PROJECT_TITLE&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export CUDA_VERSION=&amp;quot;12.1.1&amp;quot;&lt;br /&gt;
export RDZV_BACKEND=&amp;quot;c10d&amp;quot;&lt;br /&gt;
export RDZV_ID=2299&lt;br /&gt;
export RDZV_PORT=29500&lt;br /&gt;
&lt;br /&gt;
cd $SLURM_SUBMIT_DIR&lt;br /&gt;
&lt;br /&gt;
module load CUDA/$CUDA_VERSION diffusers ccache wandb flash-attn&lt;br /&gt;
&lt;br /&gt;
head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)&lt;br /&gt;
&lt;br /&gt;
export LAUNCHER=&amp;quot;python -m torch.distributed.run \&lt;br /&gt;
        --nproc_per_node $GPUS_PER_NODE \&lt;br /&gt;
        --nnodes $SLURM_NNODES \&lt;br /&gt;
        --rdzv_id $RDZV_ID \&lt;br /&gt;
        --rdzv_backend $RDZV_BACKEND \&lt;br /&gt;
        --rdzv_endpoint $head_node_ip:$RDZV_PORT \&lt;br /&gt;
&amp;quot;&lt;br /&gt;
&lt;br /&gt;
source $TRAINING_VENV/bin/activate&lt;br /&gt;
&lt;br /&gt;
export CMD=&amp;quot;$LAUNCHER $TRAINING_SCRIPT $TRAINING_ARGS&amp;quot;&lt;br /&gt;
srun --jobid $SLURM_JOB_ID bash -c &amp;quot;$CMD&amp;quot;&lt;br /&gt;
&lt;br /&gt;
deactivate&lt;br /&gt;
rsync -r $TRAINING_OUTPUT $RESULT_DEPOT&lt;br /&gt;
&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;/div&gt;</summary>
		<author><name>Kstanier</name></author>
	</entry>
	<entry>
		<id>https://wiki.gacrc.uga.edu/index.php?title=LLM_Training&amp;diff=21959</id>
		<title>LLM Training</title>
		<link rel="alternate" type="text/html" href="https://wiki.gacrc.uga.edu/index.php?title=LLM_Training&amp;diff=21959"/>
		<updated>2024-07-03T17:01:16Z</updated>

		<summary type="html">&lt;p&gt;Kstanier: /* Job Submission Script */&lt;/p&gt;
&lt;hr /&gt;
&lt;div&gt;===Introduction===&lt;br /&gt;
&lt;br /&gt;
===HuggingFace===&lt;br /&gt;
&lt;br /&gt;
==== Hub ====&lt;br /&gt;
&lt;br /&gt;
==== Libraries ====&lt;br /&gt;
&lt;br /&gt;
===== Transformers =====&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
module load Transformers&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
&lt;br /&gt;
===== Datasets =====&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
module load datasets&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
&lt;br /&gt;
===== TRL =====&lt;br /&gt;
(a software module is coming soon; in the meantime, TRL can be installed in a virtual environment)&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
python3 -m venv ~/trl_venv&lt;br /&gt;
source ~/trl_venv/bin/activate&lt;br /&gt;
pip install --require-virtualenv trl&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
&lt;br /&gt;
===Compute Resources===&lt;br /&gt;
{| class=&amp;quot;wikitable&amp;quot;&lt;br /&gt;
|+LLM Training Compute Resource Consumption&lt;br /&gt;
!Identifier&lt;br /&gt;
!Accelerator Resources&lt;br /&gt;
!Methods&lt;br /&gt;
!Training Duration&lt;br /&gt;
!Notes&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.1xMIG.A100&lt;br /&gt;
|&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Planned&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.1xL4&lt;br /&gt;
|1 * Nvidia L4 (24GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Pending&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.3xL4&lt;br /&gt;
|3 * Nvidia L4 (24GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Pending&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.4xL4&lt;br /&gt;
|4 * Nvidia L4 (24GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Pending&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.1xMI210&lt;br /&gt;
|1 * AMD MI210 (64GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Pending&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.3xMI210&lt;br /&gt;
|3 * AMD MI210 (64GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Pending&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.1xA100&lt;br /&gt;
|1 * Nvidia A100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit)&lt;br /&gt;
|&lt;br /&gt;
|Pending&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.1xA100&lt;br /&gt;
|1 * Nvidia A100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|PDBS: 1&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.3xA100&lt;br /&gt;
|3 * Nvidia A100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|PDBS: 1&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.4xA100&lt;br /&gt;
|4 * Nvidia A100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|PDBS: 1&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.1xH100&lt;br /&gt;
|1 * Nvidia H100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Planned&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.3xH100&lt;br /&gt;
|3 * Nvidia H100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Planned&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.4xH100&lt;br /&gt;
|4 * Nvidia H100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Planned&lt;br /&gt;
|}&lt;br /&gt;
&lt;br /&gt;
===Training Script (w/HuggingFace)===&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;python3&amp;quot; line=&amp;quot;1&amp;quot;&amp;gt;&lt;br /&gt;
import torch&lt;br /&gt;
&lt;br /&gt;
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments&lt;br /&gt;
from datasets import load_from_disk&lt;br /&gt;
from trl import SFTTrainer, ORPOTrainer, AutoModelForCausalLMWithValueHead, ModelConfig, get_peft_config, get_quantization_config, get_kbit_device_map, DataCollatorForCompletionOnlyLM&lt;br /&gt;
from peft import PeftModel, TaskType, LoraConfig, get_peft_model&lt;br /&gt;
from jinja2 import Template  # used to render the prompt template in prompt_formatting_func&lt;br /&gt;
 &lt;br /&gt;
base_model = &amp;quot;/scratch/$$YOUR_MYID/llm/models/hf/Meta-Llama-3-8B&amp;quot;&lt;br /&gt;
output_dir = &amp;quot;/lscratch/$$YOUR_MYID/guac0/&amp;quot;&lt;br /&gt;
&lt;br /&gt;
report_to = &amp;quot;wandb&amp;quot;&lt;br /&gt;
&lt;br /&gt;
attn_implementation = &amp;quot;flash_attention_2&amp;quot;&lt;br /&gt;
&lt;br /&gt;
def prompt_formatting_func(article):&lt;br /&gt;
    # Render each example&#039;s message list into an Alpaca-style prompt string&lt;br /&gt;
    prompt_template = Template(&amp;quot;Below is an instruction that describes a task. Write a response that appropriately completes the request.\n{% for message in messages %}\n{% if message[&#039;role&#039;] == &#039;system&#039; %}### Instruction:\n{% elif message[&#039;role&#039;] == &#039;user&#039; %}### Input:\n{% elif message[&#039;role&#039;] == &#039;assistant&#039; %}### Response:\n{% endif %}{{message[&#039;content&#039;]}}\n{% endfor %}\n### Response:\n&amp;quot;)&lt;br /&gt;
    output_texts = []&lt;br /&gt;
&lt;br /&gt;
    for i in range(len(article[&#039;hash&#039;])):&lt;br /&gt;
        text = prompt_template.render(messages = article[&#039;messages&#039;][i])&lt;br /&gt;
        output_texts.append(text)&lt;br /&gt;
    return output_texts&lt;br /&gt;
&lt;br /&gt;
if __name__ == &amp;quot;__main__&amp;quot;:&lt;br /&gt;
    # Basic model config&lt;br /&gt;
    model_config = ModelConfig(&lt;br /&gt;
        model_name_or_path      = base_model,&lt;br /&gt;
        attn_implementation     = attn_implementation,&lt;br /&gt;
    )&lt;br /&gt;
    quant_config = get_quantization_config(model_config)&lt;br /&gt;
&lt;br /&gt;
    model_kwargs = dict(&lt;br /&gt;
        torch_dtype         = &amp;quot;auto&amp;quot;,&lt;br /&gt;
        load_in_4bit        = True,&lt;br /&gt;
        trust_remote_code   = False, # Don&#039;t&lt;br /&gt;
        attn_implementation = attn_implementation,&lt;br /&gt;
        use_cache           = False, # false if grad chkpnting&lt;br /&gt;
        quantization_config = get_quantization_config(model_config),&lt;br /&gt;
        device_map          = get_kbit_device_map(),&lt;br /&gt;
    )&lt;br /&gt;
&lt;br /&gt;
    # Load model &amp;amp; tokenizer&lt;br /&gt;
    tokenizer  = AutoTokenizer.from_pretrained(base_model)&lt;br /&gt;
    model      = AutoModelForCausalLM.from_pretrained(base_model, **model_kwargs)&lt;br /&gt;
&lt;br /&gt;
    tokenizer.pad_token = tokenizer.eos_token&lt;br /&gt;
&lt;br /&gt;
    lora_config = LoraConfig(&lt;br /&gt;
        r               = 64,&lt;br /&gt;
        lora_alpha      = 16,&lt;br /&gt;
        lora_dropout    = 0.05,&lt;br /&gt;
        bias            = &amp;quot;none&amp;quot;,&lt;br /&gt;
        task_type       = &amp;quot;CAUSAL_LM&amp;quot;,&lt;br /&gt;
    )&lt;br /&gt;
    model = get_peft_model(model, lora_config)&lt;br /&gt;
&lt;br /&gt;
    train_dataset = load_from_disk(&amp;quot;/scratch/ks98810/llm/datasets/guac-merge0&amp;quot;)&lt;br /&gt;
    training_args = TrainingArguments(&lt;br /&gt;
        logging_strategy            = &amp;quot;steps&amp;quot;,&lt;br /&gt;
        logging_steps               = 500,&lt;br /&gt;
        logging_first_step          = True,&lt;br /&gt;
        report_to                   = report_to,&lt;br /&gt;
        num_train_epochs            = 3,&lt;br /&gt;
        output_dir                  = output_dir,&lt;br /&gt;
        per_device_train_batch_size = 1,&lt;br /&gt;
        learning_rate               = 2e-4,&lt;br /&gt;
    )&lt;br /&gt;
    response_template = &amp;quot;### Response:\n&amp;quot;&lt;br /&gt;
    collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer = tokenizer)&lt;br /&gt;
    trainer = SFTTrainer(&lt;br /&gt;
        model,&lt;br /&gt;
        tokenizer           = tokenizer,&lt;br /&gt;
        args                = training_args,&lt;br /&gt;
        train_dataset       = train_dataset,&lt;br /&gt;
        dataset_text_field  = &amp;quot;text&amp;quot;,&lt;br /&gt;
        max_seq_length      = 4096,&lt;br /&gt;
        peft_config         = lora_config,&lt;br /&gt;
        formatting_func     = prompt_formatting_func,&lt;br /&gt;
        data_collator       = collator,&lt;br /&gt;
    )&lt;br /&gt;
&lt;br /&gt;
    trainer.train()&lt;br /&gt;
    trainer.save_model(output_dir)&lt;br /&gt;
&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
&lt;br /&gt;
===Job Submission Script===&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;slurm&amp;quot; line=&amp;quot;1&amp;quot;&amp;gt;&lt;br /&gt;
#!/usr/bin/env bash&lt;br /&gt;
#SBATCH --job-name=train_guac0_1xA100&lt;br /&gt;
#SBATCH --cpus-per-task=16&lt;br /&gt;
#SBATCH --partition=gpu_p&lt;br /&gt;
#SBATCH --gres=gpu:A100:1&lt;br /&gt;
#SBATCH --ntasks=1&lt;br /&gt;
#SBATCH --mem=64gb&lt;br /&gt;
#SBATCH --time=03:00:00&lt;br /&gt;
#SBATCH --output=logs/%x.%j.out&lt;br /&gt;
#SBATCH --error=logs/%x.%j.err&lt;br /&gt;
&lt;br /&gt;
#SBATCH --mail-type=ALL&lt;br /&gt;
#SBATCH --mail-user=$$YOUR_MYID@uga.edu&lt;br /&gt;
&lt;br /&gt;
export JOB_CUSTODIAN=&amp;quot;$$YOUR_MYID&amp;quot;&lt;br /&gt;
export JOB_GROUP=&amp;quot;$$YOUR_LAB&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export PROJECT_DIR=&amp;quot;/work/$JOB_GROUP/$JOB_CUSTODIAN/&amp;quot;&lt;br /&gt;
export SCRATCH_DIR=&amp;quot;/scratch/$JOB_CUSTODIAN/&amp;quot;&lt;br /&gt;
export LSCRATCH_DIR=&amp;quot;/lscratch/$JOB_CUSTODIAN/&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export PROJECT_NAME=&amp;quot;guac0&amp;quot;&lt;br /&gt;
export PROJECT_VARIANT=&amp;quot;flash-attn0&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export PROJECT_TITLE=&amp;quot;$PROJECT_NAME.$PROJECT_VARIANT.$SLURM_JOBID&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export RESULT_DEPOT=&amp;quot;$SCRATCH_DIR/llm/models/hf/$PROJECT_TITLE&amp;quot;&lt;br /&gt;
export TRAINING_OUTPUT=&amp;quot;$LSCRATCH_DIR/$PROJECT_TITLE&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export TRAINING_BASE_MODEL=&amp;quot;Meta-Llama-3-8B&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export OMP_NUM_THREADS=16&lt;br /&gt;
export PER_DEVICE_BATCH_SIZE=1&lt;br /&gt;
export GPUS_PER_NODE=1&lt;br /&gt;
export TRAINING_EPOCHS=3&lt;br /&gt;
export MAX_SEQ_LENGTH=4096&lt;br /&gt;
&lt;br /&gt;
export TRAINING_VENV=&amp;quot;/scratch/$$YOUR_MYID/llm/projects/workbench/venv/&amp;quot;&lt;br /&gt;
export TRAINING_SCRIPT=&amp;quot;/scratch/$$YOUR_MYID/llm/projects/guac/scripts/training/train_guac0.py&amp;quot;&lt;br /&gt;
export TRAINING_ARGS=&amp;quot;-b $PER_DEVICE_BATCH_SIZE -m $TRAINING_BASE_MODEL -o $TRAINING_OUTPUT -e $TRAINING_EPOCHS -s $MAX_SEQ_LENGTH&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export WANDB_PROJECT=&amp;quot;$PROJECT_NAME&amp;quot;&lt;br /&gt;
export WANDB_LOG_MODEL=&amp;quot;checkpoint&amp;quot;&lt;br /&gt;
export WANDB_JOB_TYPE=&amp;quot;training&amp;quot;&lt;br /&gt;
export WANDB_NAME=&amp;quot;$PROJECT_TITLE&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export CUDA_VERSION=&amp;quot;12.1.1&amp;quot;&lt;br /&gt;
export RDZV_BACKEND=&amp;quot;c10d&amp;quot;&lt;br /&gt;
export RDZV_ID=2299&lt;br /&gt;
export RDZV_PORT=29500&lt;br /&gt;
&lt;br /&gt;
cd $SLURM_SUBMIT_DIR&lt;br /&gt;
&lt;br /&gt;
module load CUDA/$CUDA_VERSION diffusers ccache wandb flash-attn&lt;br /&gt;
&lt;br /&gt;
head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)&lt;br /&gt;
&lt;br /&gt;
export LAUNCHER=&amp;quot;python -m torch.distributed.run \&lt;br /&gt;
        --nproc_per_node $GPUS_PER_NODE \&lt;br /&gt;
        --nnodes $SLURM_NNODES \&lt;br /&gt;
        --rdzv_id $RDZV_ID \&lt;br /&gt;
        --rdzv_backend $RDZV_BACKEND \&lt;br /&gt;
        --rdzv_endpoint $head_node_ip:$RDZV_PORT \&lt;br /&gt;
&amp;quot;&lt;br /&gt;
&lt;br /&gt;
source $TRAINING_VENV/bin/activate&lt;br /&gt;
&lt;br /&gt;
export CMD=&amp;quot;$LAUNCHER $TRAINING_SCRIPT $TRAINING_ARGS&amp;quot;&lt;br /&gt;
srun --jobid $SLURM_JOB_ID bash -c &amp;quot;$CMD&amp;quot;&lt;br /&gt;
&lt;br /&gt;
deactivate&lt;br /&gt;
rsync -r $TRAINING_OUTPUT $RESULT_DEPOT&lt;br /&gt;
&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;/div&gt;</summary>
		<author><name>Kstanier</name></author>
	</entry>
	<entry>
		<id>https://wiki.gacrc.uga.edu/index.php?title=LLM_Training&amp;diff=21958</id>
		<title>LLM Training</title>
		<link rel="alternate" type="text/html" href="https://wiki.gacrc.uga.edu/index.php?title=LLM_Training&amp;diff=21958"/>
		<updated>2024-07-03T14:40:35Z</updated>

		<summary type="html">&lt;p&gt;Kstanier: /* Job Submission Script */&lt;/p&gt;
&lt;hr /&gt;
&lt;div&gt;===Introduction===&lt;br /&gt;
&lt;br /&gt;
===HuggingFace===&lt;br /&gt;
&lt;br /&gt;
==== Hub ====&lt;br /&gt;
&lt;br /&gt;
==== Libraries ====&lt;br /&gt;
&lt;br /&gt;
===== Transformers =====&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
module load Transformers&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
&lt;br /&gt;
===== Datasets =====&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
module load datasets&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
&lt;br /&gt;
===== TRL =====&lt;br /&gt;
(a software module is coming soon; in the meantime, TRL can be installed in a virtual environment)&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
python3 -m venv ~/trl_venv&lt;br /&gt;
source ~/trl_venv/bin/activate&lt;br /&gt;
pip install --require-virtualenv trl&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
&lt;br /&gt;
===Compute Resources===&lt;br /&gt;
{| class=&amp;quot;wikitable&amp;quot;&lt;br /&gt;
|+LLM Training Compute Resource Consumption&lt;br /&gt;
!Identifier&lt;br /&gt;
!Accelerator Resources&lt;br /&gt;
!Methods&lt;br /&gt;
!Training Duration&lt;br /&gt;
!Notes&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.1xMIG.A100&lt;br /&gt;
|&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Planned&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.1xL4&lt;br /&gt;
|1 * Nvidia L4 (24GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Pending&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.3xL4&lt;br /&gt;
|3 * Nvidia L4 (24GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Pending&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.4xL4&lt;br /&gt;
|4 * Nvidia L4 (24GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Pending&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.1xMI210&lt;br /&gt;
|1 * AMD MI210 (64GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Pending&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.3xMI210&lt;br /&gt;
|3 * AMD MI210 (64GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Pending&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.1xA100&lt;br /&gt;
|1 * Nvidia A100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit)&lt;br /&gt;
|&lt;br /&gt;
|Pending&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.1xA100&lt;br /&gt;
|1 * Nvidia A100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|PDBS: 1&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.3xA100&lt;br /&gt;
|3 * Nvidia A100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|PDBS: 1&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.4xA100&lt;br /&gt;
|4 * Nvidia A100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|PDBS: 1&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.1xH100&lt;br /&gt;
|1 * Nvidia H100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Planned&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.3xH100&lt;br /&gt;
|3 * Nvidia H100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Planned&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.4xH100&lt;br /&gt;
|4 * Nvidia H100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Planned&lt;br /&gt;
|}&lt;br /&gt;
&lt;br /&gt;
===Training Script (w/HuggingFace)===&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;python3&amp;quot; line=&amp;quot;1&amp;quot;&amp;gt;&lt;br /&gt;
import torch&lt;br /&gt;
&lt;br /&gt;
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments&lt;br /&gt;
from datasets import load_from_disk&lt;br /&gt;
from trl import SFTTrainer, ORPOTrainer, AutoModelForCausalLMWithValueHead, ModelConfig, get_peft_config, get_quantization_config, get_kbit_device_map, DataCollatorForCompletionOnlyLM&lt;br /&gt;
from peft import PeftModel, TaskType, LoraConfig, get_peft_model&lt;br /&gt;
from jinja2 import Template  # used to render the prompt template in prompt_formatting_func&lt;br /&gt;
 &lt;br /&gt;
base_model = &amp;quot;/scratch/$$YOUR_MYID/llm/models/hf/Meta-Llama-3-8B&amp;quot;&lt;br /&gt;
output_dir = &amp;quot;/lscratch/$$YOUR_MYID/guac0/&amp;quot;&lt;br /&gt;
&lt;br /&gt;
report_to = &amp;quot;wandb&amp;quot;&lt;br /&gt;
&lt;br /&gt;
attn_implementation = &amp;quot;flash_attention_2&amp;quot;&lt;br /&gt;
&lt;br /&gt;
def prompt_formatting_func(article):&lt;br /&gt;
    # Render each example&#039;s message list into an Alpaca-style prompt string&lt;br /&gt;
    prompt_template = Template(&amp;quot;Below is an instruction that describes a task. Write a response that appropriately completes the request.\n{% for message in messages %}\n{% if message[&#039;role&#039;] == &#039;system&#039; %}### Instruction:\n{% elif message[&#039;role&#039;] == &#039;user&#039; %}### Input:\n{% elif message[&#039;role&#039;] == &#039;assistant&#039; %}### Response:\n{% endif %}{{message[&#039;content&#039;]}}\n{% endfor %}\n### Response:\n&amp;quot;)&lt;br /&gt;
    output_texts = []&lt;br /&gt;
&lt;br /&gt;
    for i in range(len(article[&#039;hash&#039;])):&lt;br /&gt;
        text = prompt_template.render(messages = article[&#039;messages&#039;][i])&lt;br /&gt;
        output_texts.append(text)&lt;br /&gt;
    return output_texts&lt;br /&gt;
&lt;br /&gt;
if __name__ == &amp;quot;__main__&amp;quot;:&lt;br /&gt;
    # Basic model config&lt;br /&gt;
    model_config = ModelConfig(&lt;br /&gt;
        model_name_or_path      = base_model,&lt;br /&gt;
        attn_implementation     = attn_implementation,&lt;br /&gt;
    )&lt;br /&gt;
    quant_config = get_quantization_config(model_config)&lt;br /&gt;
&lt;br /&gt;
    model_kwargs = dict(&lt;br /&gt;
        torch_dtype         = &amp;quot;auto&amp;quot;,&lt;br /&gt;
        load_in_4bit        = True,&lt;br /&gt;
        trust_remote_code   = False, # Don&#039;t&lt;br /&gt;
        attn_implementation = attn_implementation,&lt;br /&gt;
        use_cache           = False, # false if grad chkpnting&lt;br /&gt;
        quantization_config = get_quantization_config(model_config),&lt;br /&gt;
        device_map          = get_kbit_device_map(),&lt;br /&gt;
    )&lt;br /&gt;
&lt;br /&gt;
    # Load model &amp;amp; tokenizer&lt;br /&gt;
    tokenizer  = AutoTokenizer.from_pretrained(base_model)&lt;br /&gt;
    model      = AutoModelForCausalLM.from_pretrained(base_model, **model_kwargs)&lt;br /&gt;
&lt;br /&gt;
    tokenizer.pad_token = tokenizer.eos_token&lt;br /&gt;
&lt;br /&gt;
    lora_config = LoraConfig(&lt;br /&gt;
        r               = 64,&lt;br /&gt;
        lora_alpha      = 16,&lt;br /&gt;
        lora_dropout    = 0.05,&lt;br /&gt;
        bias            = &amp;quot;none&amp;quot;,&lt;br /&gt;
        task_type       = &amp;quot;CAUSAL_LM&amp;quot;,&lt;br /&gt;
    )&lt;br /&gt;
    model = get_peft_model(model, lora_config)&lt;br /&gt;
&lt;br /&gt;
    train_dataset = load_from_disk(&amp;quot;/scratch/ks98810/llm/datasets/guac-merge0&amp;quot;)&lt;br /&gt;
    training_args = TrainingArguments(&lt;br /&gt;
        logging_strategy            = &amp;quot;steps&amp;quot;,&lt;br /&gt;
        logging_steps               = 500,&lt;br /&gt;
        logging_first_step          = True,&lt;br /&gt;
        report_to                   = report_to,&lt;br /&gt;
        num_train_epochs            = 3,&lt;br /&gt;
        output_dir                  = output_dir,&lt;br /&gt;
        per_device_train_batch_size = 1,&lt;br /&gt;
        learning_rate               = 2e-4,&lt;br /&gt;
    )&lt;br /&gt;
    response_template = &amp;quot;### Response:\n&amp;quot;&lt;br /&gt;
    collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer = tokenizer)&lt;br /&gt;
    trainer = SFTTrainer(&lt;br /&gt;
        model,&lt;br /&gt;
        args                = training_args,&lt;br /&gt;
        train_dataset       = train_dataset,&lt;br /&gt;
        dataset_text_field  = &amp;quot;text&amp;quot;,&lt;br /&gt;
        max_seq_length      = 4096,&lt;br /&gt;
        peft_config         = lora_config,&lt;br /&gt;
        formatting_func     = prompt_formatting_func,&lt;br /&gt;
        data_collator       = collator,&lt;br /&gt;
    )&lt;br /&gt;
&lt;br /&gt;
    trainer.train()&lt;br /&gt;
    trainer.save_model(output_dir)&lt;br /&gt;
&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
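&lt;br /&gt;
The job submission script below passes &amp;quot;-b&amp;quot;, &amp;quot;-o&amp;quot;, and &amp;quot;-e&amp;quot; flags to this training script, which the listing above does not yet parse. The following is a minimal sketch of how those flags could be read with argparse; the long option names and defaults are illustrative assumptions, not part of the original script.&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;python3&amp;quot;&amp;gt;&lt;br /&gt;
# Sketch only: option names and defaults below are assumptions for illustration.&lt;br /&gt;
import argparse&lt;br /&gt;
&lt;br /&gt;
parser = argparse.ArgumentParser(description=&amp;quot;guac0 supervised fine-tuning&amp;quot;)&lt;br /&gt;
parser.add_argument(&amp;quot;-b&amp;quot;, &amp;quot;--per-device-batch-size&amp;quot;, type=int, default=1)&lt;br /&gt;
parser.add_argument(&amp;quot;-o&amp;quot;, &amp;quot;--output-dir&amp;quot;, default=&amp;quot;.&amp;quot;)&lt;br /&gt;
parser.add_argument(&amp;quot;-e&amp;quot;, &amp;quot;--epochs&amp;quot;, type=int, default=3)&lt;br /&gt;
args = parser.parse_args()&lt;br /&gt;
&lt;br /&gt;
# The parsed values would then replace the hard-coded ones above, e.g.:&lt;br /&gt;
#   per_device_train_batch_size = args.per_device_batch_size&lt;br /&gt;
#   num_train_epochs            = args.epochs&lt;br /&gt;
#   output_dir                  = args.output_dir&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;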
&lt;br /&gt;
===Job Submission Script===&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
#!/usr/bin/env bash&lt;br /&gt;
#SBATCH --job-name=train_guac0_1xA100&lt;br /&gt;
#SBATCH --cpus-per-task=16&lt;br /&gt;
#SBATCH --partition=gpu_p&lt;br /&gt;
#SBATCH --gres=gpu:A100:1&lt;br /&gt;
#SBATCH --ntasks=1&lt;br /&gt;
#SBATCH --mem=64gb&lt;br /&gt;
#SBATCH --time=03:00:00&lt;br /&gt;
#SBATCH --output=logs/%x.%j.out&lt;br /&gt;
#SBATCH --error=logs/%x.%j.err&lt;br /&gt;
&lt;br /&gt;
#SBATCH --mail-type=ALL&lt;br /&gt;
#SBATCH --mail-user=$$YOUR_MYID@uga.edu&lt;br /&gt;
&lt;br /&gt;
export JOB_CUSTODIAN=&amp;quot;$$YOUR_MYID&amp;quot;&lt;br /&gt;
export JOB_GROUP=&amp;quot;$$YOUR_LAB&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export PROJECT_DIR=&amp;quot;/work/$JOB_GROUP/$JOB_CUSTODIAN/&amp;quot;&lt;br /&gt;
export SCRATCH_DIR=&amp;quot;/scratch/$JOB_CUSTODIAN/&amp;quot;&lt;br /&gt;
export LSCRATCH_DIR=&amp;quot;/lscratch/$JOB_CUSTODIAN/&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export PROJECT_NAME=&amp;quot;guac0&amp;quot;&lt;br /&gt;
export PROJECT_VARIANT=&amp;quot;flash-attn0&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export PROJECT_TITLE=&amp;quot;$PROJECT_NAME.$PROJECT_VARIANT.$SLURM_JOBID&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export RESULT_DEPOT=&amp;quot;$SCRATCH_DIR/llm/models/hf/$PROJECT_TITLE&amp;quot;&lt;br /&gt;
export TRAINING_OUTPUT=&amp;quot;$LSCRATCH_DIR/$PROJECT_TITLE&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export OMP_NUM_THREADS=16&lt;br /&gt;
&lt;br /&gt;
export PER_DEVICE_BATCH_SIZE=1&lt;br /&gt;
export GPUS_PER_NODE=1&lt;br /&gt;
&lt;br /&gt;
export TRAINING_EPOCHS=3&lt;br /&gt;
&lt;br /&gt;
export TRAINING_VENV=&amp;quot;/scratch/$$YOUR_MYID/llm/projects/workbench/venv/&amp;quot;&lt;br /&gt;
export TRAINING_SCRIPT=&amp;quot;/scratch/$$YOUR_MYID/llm/projects/guac/scripts/training/train_guac0.py&amp;quot;&lt;br /&gt;
export TRAINING_ARGS=&amp;quot;-b $PER_DEVICE_BATCH_SIZE -o $TRAINING_OUTPUT -e $TRAINING_EPOCHS&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export WANDB_PROJECT=&amp;quot;$PROJECT_NAME&amp;quot;&lt;br /&gt;
export WANDB_LOG_MODEL=&amp;quot;checkpoint&amp;quot;&lt;br /&gt;
export WANDB_JOB_TYPE=&amp;quot;training&amp;quot;&lt;br /&gt;
export WANDB_NAME=&amp;quot;$PROJECT_TITLE&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export CUDA_VERSION=&amp;quot;12.1.1&amp;quot;&lt;br /&gt;
export RDZV_BACKEND=&amp;quot;c10d&amp;quot;&lt;br /&gt;
export RDZV_ID=2299&lt;br /&gt;
export RDZV_PORT=29500&lt;br /&gt;
&lt;br /&gt;
cd $SLURM_SUBMIT_DIR&lt;br /&gt;
&lt;br /&gt;
module load CUDA/$CUDA_VERSION diffusers ccache wandb flash-attn&lt;br /&gt;
&lt;br /&gt;
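# Rendezvous host for torch.distributed.run: the first node in the Slurm allocation&lt;br /&gt;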
head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)&lt;br /&gt;
&lt;br /&gt;
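# Launch one process per GPU on each node, rendezvousing over c10d at the head node&lt;br /&gt;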
export LAUNCHER=&amp;quot;python -m torch.distributed.run \&lt;br /&gt;
        --nproc_per_node $GPUS_PER_NODE \&lt;br /&gt;
        --nnodes $SLURM_NNODES \&lt;br /&gt;
        --rdzv_id $RDZV_ID \&lt;br /&gt;
        --rdzv_backend $RDZV_BACKEND \&lt;br /&gt;
        --rdzv_endpoint $head_node_ip:$RDZV_PORT \&lt;br /&gt;
&amp;quot;&lt;br /&gt;
&lt;br /&gt;
source $TRAINING_VENV/bin/activate&lt;br /&gt;
&lt;br /&gt;
export CMD=&amp;quot;$LAUNCHER $TRAINING_SCRIPT $TRAINING_ARGS&amp;quot;&lt;br /&gt;
srun --jobid $SLURM_JOB_ID bash -c &amp;quot;$CMD&amp;quot;&lt;br /&gt;
&lt;br /&gt;
deactivate&lt;br /&gt;
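# Copy the training output from node-local lscratch back to shared scratch before the job ends&lt;br /&gt;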
rsync -r $TRAINING_OUTPUT $RESULT_DEPOT&lt;br /&gt;
&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;/div&gt;</summary>
		<author><name>Kstanier</name></author>
	</entry>
	<entry>
		<id>https://wiki.gacrc.uga.edu/index.php?title=LLM_Training&amp;diff=21957</id>
		<title>LLM Training</title>
		<link rel="alternate" type="text/html" href="https://wiki.gacrc.uga.edu/index.php?title=LLM_Training&amp;diff=21957"/>
		<updated>2024-07-03T14:39:52Z</updated>

		<summary type="html">&lt;p&gt;Kstanier: sanitized&lt;/p&gt;
&lt;hr /&gt;
&lt;div&gt;===Introduction===&lt;br /&gt;
&lt;br /&gt;
===HuggingFace===&lt;br /&gt;
&lt;br /&gt;
==== Hub ====&lt;br /&gt;
&lt;br /&gt;
==== Libraries ====&lt;br /&gt;
&lt;br /&gt;
===== Transformers =====&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
module load Transformers&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
&lt;br /&gt;
===== Datasets =====&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
module load datasets&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
&lt;br /&gt;
===== TRL =====&lt;br /&gt;
(software module coming soon, can be installed in venv meanwhile)&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
python3 -m venv ~/trl_venv&lt;br /&gt;
source ~/trl_venv/bin/activate&lt;br /&gt;
pip install --require-virtualenv trl&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
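&lt;br /&gt;
After creating the virtual environment, a quick check (a sketch; it assumes the venv created above is still activated) confirms that the installed TRL build imports cleanly:&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
python -c &amp;quot;import trl; print(trl.__version__)&amp;quot;&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;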
&lt;br /&gt;
===Compute Resources===&lt;br /&gt;
{| class=&amp;quot;wikitable&amp;quot;&lt;br /&gt;
|+LLM Training Compute Resource Consumption&lt;br /&gt;
!Identifier&lt;br /&gt;
!Accelerator Resources&lt;br /&gt;
!Methods&lt;br /&gt;
!Training Duration&lt;br /&gt;
!Notes&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.1xMIG.A100&lt;br /&gt;
|&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Planned&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.1xL4&lt;br /&gt;
|1 * Nvidia L4 (24GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Pending&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.3xL4&lt;br /&gt;
|3 * Nvidia L4 (24GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Pending&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.4xL4&lt;br /&gt;
|4 * Nvidia L4 (24GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Pending&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.1xMI210&lt;br /&gt;
|1 * AMD MI210 (64GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Pending&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.3xMI210&lt;br /&gt;
|3 * AMD MI210 (64GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Pending&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.1xA100&lt;br /&gt;
|1 * Nvidia A100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit)&lt;br /&gt;
|&lt;br /&gt;
|Pending&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.1xA100&lt;br /&gt;
|1 * Nvidia A100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|PDBS: 1&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.3xA100&lt;br /&gt;
|3 * Nvidia A100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|PDBS: 1&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.4xA100&lt;br /&gt;
|4 * Nvidia A100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|PDBS: 1&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.1xH100&lt;br /&gt;
|1 * Nvidia H100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Planned&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.3xH100&lt;br /&gt;
|3 * Nvidia H100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Planned&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.4xH100&lt;br /&gt;
|4 * Nvidia H100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Planned&lt;br /&gt;
|}&lt;br /&gt;
&lt;br /&gt;
===Training Script (w/HuggingFace)===&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;python3&amp;quot; line=&amp;quot;1&amp;quot;&amp;gt;&lt;br /&gt;
import torch&lt;br /&gt;
&lt;br /&gt;
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments&lt;br /&gt;
from datasets import load_from_disk&lt;br /&gt;
from trl import SFTTrainer, ORPOTrainer, AutoModelForCausalLMWithValueHead, ModelConfig, get_peft_config, get_quantization_config, get_kbit_device_map, DataCollatorForCompletionOnlyLM&lt;br /&gt;
from peft import PeftModel, TaskType, LoraConfig, get_peft_model&lt;br /&gt;
from jinja2 import Template # used to render the prompt template in prompt_formatting_func&lt;br /&gt;
 &lt;br /&gt;
base_model = &amp;quot;/scratch/$$YOUR_MYID/llm/models/hf/Meta-Llama-3-8B&amp;quot;&lt;br /&gt;
output_dir = &amp;quot;/lscratch/$$YOUR_MYID/guac0/&amp;quot;&lt;br /&gt;
&lt;br /&gt;
report_to = &amp;quot;wandb&amp;quot;&lt;br /&gt;
&lt;br /&gt;
attn_implementation = &amp;quot;flash_attention_2&amp;quot;&lt;br /&gt;
&lt;br /&gt;
def prompt_formatting_func(article):&lt;br /&gt;
    # Render each batch of dataset examples into Alpaca-style prompt strings for SFTTrainer&lt;br /&gt;
    output_texts = []&lt;br /&gt;
&lt;br /&gt;
    for i in range(len(article[&#039;hash&#039;])):&lt;br /&gt;
        text = Template(&amp;quot;Below is an instruction that describes a task. Write a response that appropriately completes the request.\n{% for message in messages %}\n{% if message[&#039;role&#039;] == &#039;system&#039; %}### Instruction:\n{% elif message[&#039;role&#039;] == &#039;user&#039; %}### Input:\n{% elif message[&#039;role&#039;] == &#039;assistant&#039; %}### Response:\n{% endif %}{{message[&#039;content&#039;]}}\n{% endfor %}\n### Response:\n&amp;quot;).render(messages = article[&#039;messages&#039;][i])&lt;br /&gt;
        output_texts.append(text)&lt;br /&gt;
    return output_texts&lt;br /&gt;
&lt;br /&gt;
if __name__ == &amp;quot;__main__&amp;quot;:&lt;br /&gt;
    # Basic model config&lt;br /&gt;
    model_config = ModelConfig(&lt;br /&gt;
        model_name_or_path      = base_model,&lt;br /&gt;
        attn_implementation     = attn_implementation,&lt;br /&gt;
        load_in_4bit            = True, # have get_quantization_config() build a 4-bit BitsAndBytes config&lt;br /&gt;
    )&lt;br /&gt;
    quant_config = get_quantization_config(model_config)&lt;br /&gt;
&lt;br /&gt;
    model_kwargs = dict(&lt;br /&gt;
        torch_dtype         = &amp;quot;auto&amp;quot;,&lt;br /&gt;
        trust_remote_code   = False, # do not execute custom model code from the Hub&lt;br /&gt;
        attn_implementation = attn_implementation,&lt;br /&gt;
        use_cache           = False, # disable the KV cache; required if gradient checkpointing is enabled&lt;br /&gt;
        quantization_config = quant_config, # 4-bit config built from model_config above&lt;br /&gt;
        device_map          = get_kbit_device_map(),&lt;br /&gt;
    )&lt;br /&gt;
&lt;br /&gt;
    # Load model &amp;amp; tokenizer&lt;br /&gt;
    tokenizer  = AutoTokenizer.from_pretrained(base_model)&lt;br /&gt;
    model      = AutoModelForCausalLM.from_pretrained(base_model, **model_kwargs)&lt;br /&gt;
&lt;br /&gt;
    tokenizer.pad_token = tokenizer.eos_token&lt;br /&gt;
&lt;br /&gt;
    lora_config = LoraConfig(&lt;br /&gt;
        r               = 64,&lt;br /&gt;
        lora_alpha      = 16,&lt;br /&gt;
        lora_dropout    = 0.05,&lt;br /&gt;
        bias            = &amp;quot;none&amp;quot;,&lt;br /&gt;
        task_type       = &amp;quot;CAUSAL_LM&amp;quot;,&lt;br /&gt;
    )&lt;br /&gt;
    model = get_peft_model(model, lora_config)&lt;br /&gt;
&lt;br /&gt;
    train_dataset = load_from_disk(&amp;quot;/scratch/$$YOUR_MYID/llm/datasets/guac-merge0&amp;quot;)&lt;br /&gt;
    training_args = TrainingArguments(&lt;br /&gt;
        logging_strategy            = &amp;quot;steps&amp;quot;,&lt;br /&gt;
        logging_steps               = 500,&lt;br /&gt;
        logging_first_step          = True,&lt;br /&gt;
        report_to                   = report_to,&lt;br /&gt;
        num_train_epochs            = 3,&lt;br /&gt;
        output_dir                  = output_dir,&lt;br /&gt;
        per_device_train_batch_size = 1,&lt;br /&gt;
        learning_rate               = 2e-4,&lt;br /&gt;
    )&lt;br /&gt;
    response_template = &amp;quot;### Response:\n&amp;quot;&lt;br /&gt;
    collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer = tokenizer)&lt;br /&gt;
    trainer = SFTTrainer(&lt;br /&gt;
        model,&lt;br /&gt;
        args                = training_args,&lt;br /&gt;
        train_dataset       = train_dataset,&lt;br /&gt;
        dataset_text_field  = &amp;quot;text&amp;quot;,&lt;br /&gt;
        max_seq_length      = 4096,&lt;br /&gt;
        peft_config         = lora_config,&lt;br /&gt;
        formatting_func     = prompt_formatting_func,&lt;br /&gt;
        data_collator       = collator,&lt;br /&gt;
    )&lt;br /&gt;
&lt;br /&gt;
    trainer.train()&lt;br /&gt;
    trainer.save_model(output_dir)&lt;br /&gt;
&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
&lt;br /&gt;
===Job Submission Script===&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
#!/usr/bin/env bash&lt;br /&gt;
#SBATCH --job-name=train_guac0_1xA100&lt;br /&gt;
#SBATCH --cpus-per-task=16&lt;br /&gt;
#SBATCH --partition=gpu_p&lt;br /&gt;
#SBATCH --gres=gpu:A100:1&lt;br /&gt;
#SBATCH --ntasks=1&lt;br /&gt;
#SBATCH --mem=64gb&lt;br /&gt;
#SBATCH --time=03:00:00&lt;br /&gt;
#SBATCH --output=logs/%x.%j.out&lt;br /&gt;
#SBATCH --error=logs/%x.%j.err&lt;br /&gt;
&lt;br /&gt;
#SBATCH --mail-type=ALL&lt;br /&gt;
#SBATCH --mail-user=$$YOUR_MYID@uga.edu&lt;br /&gt;
&lt;br /&gt;
export JOB_CUSTODIAN=&amp;quot;$$YOUR_MYID&amp;quot;&lt;br /&gt;
export JOB_GROUP=&amp;quot;$$YOUR_LAB&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export PROJECT_DIR=&amp;quot;/work/$JOB_GROUP/$JOB_CUSTODIAN/&amp;quot;&lt;br /&gt;
export SCRATCH_DIR=&amp;quot;/scratch/$JOB_CUSTODIAN/&amp;quot;&lt;br /&gt;
export LSCRATCH_DIR=&amp;quot;/lscratch/$JOB_CUSTODIAN/&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export PROJECT_NAME=&amp;quot;guac0&amp;quot;&lt;br /&gt;
export PROJECT_VARIANT=&amp;quot;flash-attn0&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export PROJECT_TITLE=&amp;quot;$PROJECT_NAME.$PROJECT_VARIANT.$SLURM_JOBID&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export RESULT_DEPOT=&amp;quot;$SCRATCH_DIR/llm/models/hf/$PROJECT_TITLE&amp;quot;&lt;br /&gt;
export TRAINING_OUTPUT=&amp;quot;$LSCRATCH_DIR/$PROJECT_TITLE&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export OMP_NUM_THREADS=16&lt;br /&gt;
&lt;br /&gt;
export PER_DEVICE_BATCH_SIZE=1&lt;br /&gt;
export GPUS_PER_NODE=1&lt;br /&gt;
&lt;br /&gt;
export TRAINING_EPOCHS=3&lt;br /&gt;
&lt;br /&gt;
export TRAINING_VENV=&amp;quot;/scratch/$$YOUR_MYID/llm/projects/workbench/venv/&amp;quot;&lt;br /&gt;
export TRAINING_SCRIPT=&amp;quot;/scratch/$$YOUR_MYID/llm/projects/guac/scripts/training/train_guac0.py&amp;quot;&lt;br /&gt;
export TRAINING_ARGS=&amp;quot;-b $PER_DEVICE_BATCH_SIZE -o $TRAINING_OUTPUT -e $TRAINING_EPOCHS&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export WANDB_PROJECT=&amp;quot;$PROJECT_NAME&amp;quot;&lt;br /&gt;
export WANDB_LOG_MODEL=&amp;quot;checkpoint&amp;quot;&lt;br /&gt;
export WANDB_JOB_TYPE=&amp;quot;training&amp;quot;&lt;br /&gt;
export WANDB_NAME=&amp;quot;$PROJECT_TITLE&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export CUDA_VERSION=&amp;quot;12.1.1&amp;quot;&lt;br /&gt;
export RDZV_BACKEND=&amp;quot;c10d&amp;quot;&lt;br /&gt;
export RDZV_ID=2299&lt;br /&gt;
export RDZV_PORT=29500&lt;br /&gt;
&lt;br /&gt;
cd $SLURM_SUBMIT_DIR&lt;br /&gt;
&lt;br /&gt;
module load CUDA/$CUDA_VERSION diffusers ccache wandb flash-attn&lt;br /&gt;
&lt;br /&gt;
head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)&lt;br /&gt;
&lt;br /&gt;
export LAUNCHER=&amp;quot;python -m torch.distributed.run \&lt;br /&gt;
        --nproc_per_node $GPUS_PER_NODE \&lt;br /&gt;
        --nnodes $SLURM_NNODES \&lt;br /&gt;
        --rdzv_id $RDZV_ID \&lt;br /&gt;
        --rdzv_backend $RDZV_BACKEND \&lt;br /&gt;
        --rdzv_endpoint $head_node_ip:$RDZV_PORT \&lt;br /&gt;
&amp;quot;&lt;br /&gt;
&lt;br /&gt;
source $TRAINING_VENV/bin/activate&lt;br /&gt;
&lt;br /&gt;
export CMD=&amp;quot;$LAUNCHER $TRAINING_SCRIPT $TRAINING_ARGS&amp;quot;&lt;br /&gt;
srun --jobid $SLURM_JOB_ID bash -c &amp;quot;$CMD&amp;quot;&lt;br /&gt;
&lt;br /&gt;
deactivate&lt;br /&gt;
rsync -r $TRAINING_OUTPUT $RESULT_DEPOT&lt;br /&gt;
&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;/div&gt;</summary>
		<author><name>Kstanier</name></author>
	</entry>
	<entry>
		<id>https://wiki.gacrc.uga.edu/index.php?title=LLM_Training&amp;diff=21956</id>
		<title>LLM Training</title>
		<link rel="alternate" type="text/html" href="https://wiki.gacrc.uga.edu/index.php?title=LLM_Training&amp;diff=21956"/>
		<updated>2024-07-03T14:35:30Z</updated>

		<summary type="html">&lt;p&gt;Kstanier: Added initial training workflow details&lt;/p&gt;
&lt;hr /&gt;
&lt;div&gt;===Introduction===&lt;br /&gt;
&lt;br /&gt;
===HuggingFace===&lt;br /&gt;
&lt;br /&gt;
==== Hub ====&lt;br /&gt;
&lt;br /&gt;
==== Libraries ====&lt;br /&gt;
&lt;br /&gt;
===== Transformers =====&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
module load Transformers&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
&lt;br /&gt;
===== Datasets =====&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
module load datasets&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
&lt;br /&gt;
===== TRL =====&lt;br /&gt;
(software module coming soon, can be installed in venv meanwhile)&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
python3 -m venv ~/trl_venv&lt;br /&gt;
source ~/trl_venv/bin/activate&lt;br /&gt;
pip install --require-virtualenv trl&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
&lt;br /&gt;
===Compute Resources===&lt;br /&gt;
{| class=&amp;quot;wikitable&amp;quot;&lt;br /&gt;
|+LLM Training Compute Resource Consumption&lt;br /&gt;
!Identifier&lt;br /&gt;
!Accelerator Resources&lt;br /&gt;
!Methods&lt;br /&gt;
!Training Duration&lt;br /&gt;
!Notes&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.1xMIG.A100&lt;br /&gt;
|&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Planned&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.1xL4&lt;br /&gt;
|1 * Nvidia L4 (24GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Pending&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.3xL4&lt;br /&gt;
|3 * Nvidia L4 (24GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Pending&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.4xL4&lt;br /&gt;
|4 * Nvidia L4 (24GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Pending&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.1xMI210&lt;br /&gt;
|1 * AMD MI210 (64GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Pending&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.3xMI210&lt;br /&gt;
|3 * AMD MI210 (64GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Pending&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.1xA100&lt;br /&gt;
|1 * Nvidia A100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit)&lt;br /&gt;
|&lt;br /&gt;
|Pending&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.1xA100&lt;br /&gt;
|1 * Nvidia A100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|PDBS: 1&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.3xA100&lt;br /&gt;
|3 * Nvidia A100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|PDBS: 1&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.4xA100&lt;br /&gt;
|4 * Nvidia A100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|PDBS: 1&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.1xH100&lt;br /&gt;
|1 * Nvidia H100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Planned&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.3xH100&lt;br /&gt;
|3 * Nvidia H100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Planned&lt;br /&gt;
|-&lt;br /&gt;
|alpaca.4xH100&lt;br /&gt;
|4 * Nvidia H100 (80GB VRAM each)&lt;br /&gt;
|LoRA (4-bit), FlashAttention2&lt;br /&gt;
|&lt;br /&gt;
|Planned&lt;br /&gt;
|}&lt;br /&gt;
&lt;br /&gt;
===Training Script (w/HuggingFace)===&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;python3&amp;quot; line=&amp;quot;1&amp;quot;&amp;gt;&lt;br /&gt;
import torch&lt;br /&gt;
&lt;br /&gt;
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments&lt;br /&gt;
from datasets import load_from_disk&lt;br /&gt;
from trl import SFTTrainer, ORPOTrainer, AutoModelForCausalLMWithValueHead, ModelConfig, get_peft_config, get_quantization_config, get_kbit_device_map, DataCollatorForCompletionOnlyLM&lt;br /&gt;
from peft import PeftModel, TaskType, LoraConfig, get_peft_model&lt;br /&gt;
from jinja2 import Template # used to render the prompt template in prompt_formatting_func&lt;br /&gt;
 &lt;br /&gt;
base_model = &amp;quot;/scratch/ks98810/llm/models/hf/Meta-Llama-3-8B&amp;quot;&lt;br /&gt;
output_dir = &amp;quot;/lscratch/ks98810/guac0/&amp;quot;&lt;br /&gt;
&lt;br /&gt;
report_to = &amp;quot;wandb&amp;quot;&lt;br /&gt;
&lt;br /&gt;
attn_implementation = &amp;quot;flash_attention_2&amp;quot;&lt;br /&gt;
&lt;br /&gt;
def prompt_formatting_func(article):&lt;br /&gt;
    # Render each batch of dataset examples into Alpaca-style prompt strings for SFTTrainer&lt;br /&gt;
    output_texts = []&lt;br /&gt;
&lt;br /&gt;
    for i in range(len(article[&#039;hash&#039;])):&lt;br /&gt;
        text = Template(&amp;quot;Below is an instruction that describes a task. Write a response that appropriately completes the request.\n{% for message in messages %}\n{% if message[&#039;role&#039;] == &#039;system&#039; %}### Instruction:\n{% elif message[&#039;role&#039;] == &#039;user&#039; %}### Input:\n{% elif message[&#039;role&#039;] == &#039;assistant&#039; %}### Response:\n{% endif %}{{message[&#039;content&#039;]}}\n{% endfor %}\n### Response:\n&amp;quot;).render(messages = article[&#039;messages&#039;][i])&lt;br /&gt;
        output_texts.append(text)&lt;br /&gt;
    return output_texts&lt;br /&gt;
&lt;br /&gt;
if __name__ == &amp;quot;__main__&amp;quot;:&lt;br /&gt;
    # Basic model config&lt;br /&gt;
    model_config = ModelConfig(&lt;br /&gt;
        model_name_or_path      = base_model,&lt;br /&gt;
        attn_implementation     = attn_implementation,&lt;br /&gt;
        load_in_4bit            = True, # have get_quantization_config() build a 4-bit BitsAndBytes config&lt;br /&gt;
    )&lt;br /&gt;
    quant_config = get_quantization_config(model_config)&lt;br /&gt;
&lt;br /&gt;
    model_kwargs = dict(&lt;br /&gt;
        torch_dtype         = &amp;quot;auto&amp;quot;,&lt;br /&gt;
        trust_remote_code   = False, # do not execute custom model code from the Hub&lt;br /&gt;
        attn_implementation = attn_implementation,&lt;br /&gt;
        use_cache           = False, # disable the KV cache; required if gradient checkpointing is enabled&lt;br /&gt;
        quantization_config = quant_config, # 4-bit config built from model_config above&lt;br /&gt;
        device_map          = get_kbit_device_map(),&lt;br /&gt;
    )&lt;br /&gt;
&lt;br /&gt;
    # Load model &amp;amp; tokenizer&lt;br /&gt;
    tokenizer  = AutoTokenizer.from_pretrained(base_model)&lt;br /&gt;
    model      = AutoModelForCausalLM.from_pretrained(base_model, **model_kwargs)&lt;br /&gt;
&lt;br /&gt;
    tokenizer.pad_token = tokenizer.eos_token&lt;br /&gt;
&lt;br /&gt;
    lora_config = LoraConfig(&lt;br /&gt;
        r               = 64,&lt;br /&gt;
        lora_alpha      = 16,&lt;br /&gt;
        lora_dropout    = 0.05,&lt;br /&gt;
        bias            = &amp;quot;none&amp;quot;,&lt;br /&gt;
        task_type       = &amp;quot;CAUSAL_LM&amp;quot;,&lt;br /&gt;
    )&lt;br /&gt;
    model = get_peft_model(model, lora_config)&lt;br /&gt;
&lt;br /&gt;
    train_dataset = load_from_disk(&amp;quot;/scratch/ks98810/llm/datasets/guac-merge0&amp;quot;)&lt;br /&gt;
    training_args = TrainingArguments(&lt;br /&gt;
        logging_strategy            = &amp;quot;steps&amp;quot;,&lt;br /&gt;
        logging_steps               = 500,&lt;br /&gt;
        logging_first_step          = True,&lt;br /&gt;
        report_to                   = report_to,&lt;br /&gt;
        num_train_epochs            = 3,&lt;br /&gt;
        output_dir                  = output_dir,&lt;br /&gt;
        per_device_train_batch_size = 1,&lt;br /&gt;
        learning_rate               = 2e-4,&lt;br /&gt;
    )&lt;br /&gt;
    response_template = &amp;quot;### Response:\n&amp;quot;&lt;br /&gt;
    collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer = tokenizer)&lt;br /&gt;
    trainer = SFTTrainer(&lt;br /&gt;
        model,&lt;br /&gt;
        args                = training_args,&lt;br /&gt;
        train_dataset       = train_dataset,&lt;br /&gt;
        dataset_text_field  = &amp;quot;text&amp;quot;,&lt;br /&gt;
        max_seq_length      = 4096,&lt;br /&gt;
        peft_config         = lora_config,&lt;br /&gt;
        formatting_func     = prompt_formatting_func,&lt;br /&gt;
        data_collator       = collator,&lt;br /&gt;
    )&lt;br /&gt;
&lt;br /&gt;
    trainer.train()&lt;br /&gt;
    trainer.save_model(output_dir)&lt;br /&gt;
&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;br /&gt;
&lt;br /&gt;
===Job Submission Script===&lt;br /&gt;
&amp;lt;syntaxhighlight lang=&amp;quot;bash&amp;quot;&amp;gt;&lt;br /&gt;
#!/usr/bin/env bash&lt;br /&gt;
#SBATCH --job-name=train_guac0_4xA100&lt;br /&gt;
#SBATCH --cpus-per-task=32&lt;br /&gt;
#SBATCH --partition=gpu_p&lt;br /&gt;
#SBATCH --gres=gpu:A100:4&lt;br /&gt;
#SBATCH --ntasks=1&lt;br /&gt;
#SBATCH --mem=256gb&lt;br /&gt;
#SBATCH --time=06:00:00&lt;br /&gt;
#SBATCH --output=logs/%x.%j.out&lt;br /&gt;
#SBATCH --error=logs/%x.%j.err&lt;br /&gt;
&lt;br /&gt;
#SBATCH --mail-type=ALL&lt;br /&gt;
#SBATCH --mail-user=ks98810@uga.edu&lt;br /&gt;
&lt;br /&gt;
export JOB_CUSTODIAN=&amp;quot;ks98810&amp;quot;&lt;br /&gt;
export JOB_GROUP=&amp;quot;gclab&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export PROJECT_DIR=&amp;quot;/work/$JOB_GROUP/$JOB_CUSTODIAN/&amp;quot;&lt;br /&gt;
export SCRATCH_DIR=&amp;quot;/scratch/$JOB_CUSTODIAN/&amp;quot;&lt;br /&gt;
export LSCRATCH_DIR=&amp;quot;/lscratch/$JOB_CUSTODIAN/&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export PROJECT_NAME=&amp;quot;guac0&amp;quot;&lt;br /&gt;
export PROJECT_VARIANT=&amp;quot;flash-attn0&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export PROJECT_TITLE=&amp;quot;$PROJECT_NAME.$PROJECT_VARIANT.$SLURM_JOBID&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export RESULT_DEPOT=&amp;quot;$SCRATCH_DIR/llm/models/hf/$PROJECT_TITLE&amp;quot;&lt;br /&gt;
export TRAINING_OUTPUT=&amp;quot;$LSCRATCH_DIR/$PROJECT_TITLE&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export OMP_NUM_THREADS=16&lt;br /&gt;
&lt;br /&gt;
export PER_DEVICE_BATCH_SIZE=1&lt;br /&gt;
export GPUS_PER_NODE=4&lt;br /&gt;
&lt;br /&gt;
export TRAINING_EPOCHS=3&lt;br /&gt;
&lt;br /&gt;
export TRAINING_VENV=&amp;quot;/scratch/ks98810/llm/projects/workbench/venv/&amp;quot;&lt;br /&gt;
export TRAINING_SCRIPT=&amp;quot;/scratch/ks98810/llm/projects/guac/scripts/training/train_guac0.py&amp;quot;&lt;br /&gt;
export TRAINING_ARGS=&amp;quot;-b $PER_DEVICE_BATCH_SIZE -o $TRAINING_OUTPUT -e $TRAINING_EPOCHS&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export WANDB_PROJECT=&amp;quot;$PROJECT_NAME&amp;quot;&lt;br /&gt;
export WANDB_LOG_MODEL=&amp;quot;checkpoint&amp;quot;&lt;br /&gt;
export WANDB_JOB_TYPE=&amp;quot;training&amp;quot;&lt;br /&gt;
export WANDB_NAME=&amp;quot;$PROJECT_TITLE&amp;quot;&lt;br /&gt;
&lt;br /&gt;
export CUDA_VERSION=&amp;quot;12.1.1&amp;quot;&lt;br /&gt;
export RDZV_BACKEND=&amp;quot;c10d&amp;quot;&lt;br /&gt;
export RDZV_ID=2299&lt;br /&gt;
export RDZV_PORT=29500&lt;br /&gt;
&lt;br /&gt;
cd $SLURM_SUBMIT_DIR&lt;br /&gt;
&lt;br /&gt;
module load CUDA/$CUDA_VERSION diffusers ccache wandb flash-attn&lt;br /&gt;
&lt;br /&gt;
head_node_ip=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)&lt;br /&gt;
&lt;br /&gt;
export LAUNCHER=&amp;quot;python -m torch.distributed.run \&lt;br /&gt;
        --nproc_per_node $GPUS_PER_NODE \&lt;br /&gt;
        --nnodes $SLURM_NNODES \&lt;br /&gt;
        --rdzv_id $RDZV_ID \&lt;br /&gt;
        --rdzv_backend $RDZV_BACKEND \&lt;br /&gt;
        --rdzv_endpoint $head_node_ip:$RDZV_PORT \&lt;br /&gt;
&amp;quot;&lt;br /&gt;
&lt;br /&gt;
source $TRAINING_VENV/bin/activate&lt;br /&gt;
&lt;br /&gt;
export CMD=&amp;quot;$LAUNCHER $TRAINING_SCRIPT $TRAINING_ARGS&amp;quot;&lt;br /&gt;
srun --jobid $SLURM_JOB_ID bash -c &amp;quot;$CMD&amp;quot;&lt;br /&gt;
&lt;br /&gt;
deactivate&lt;br /&gt;
rsync -r $TRAINING_OUTPUT $RESULT_DEPOT&lt;br /&gt;
&lt;br /&gt;
&amp;lt;/syntaxhighlight&amp;gt;&lt;/div&gt;</summary>
		<author><name>Kstanier</name></author>
	</entry>
</feed>