-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfinetune_multi_node.sh
More file actions
executable file
·103 lines (91 loc) · 3.15 KB
/
finetune_multi_node.sh
File metadata and controls
executable file
·103 lines (91 loc) · 3.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/bin/bash
# Fine-tune a compression-pretrained base model on a graph dataset, launched
# across one or more nodes via torchrun.
# Usage: finetune_multi_node.sh <base_model> <dataset> <mp_layer_num>
# NOTE(review): a stray bare `export` was removed here — with no arguments it
# exports nothing and only dumps the whole environment to stdout (log noise).

# --- 1. Get model and dataset name from command line ---
BASE_MODEL=$1   # short model name, e.g. qwen_3b
DATASET=$2      # dataset name, e.g. citeseer
MP=$3           # number of mp (message-passing) layers

# --- Fixed hyper-parameters for this run ---
C_RATIO=0.1       # compression ratio
LORA_RANK=64      # LoRA rank
LORA_ALPHA=64     # LoRA scaling alpha
LORA_DROPOUT=0.1  # LoRA dropout probability
EPOCH=3           # number of training epochs
# Echo the run configuration so it is captured in the job log.
banner() {
  printf '>>>>>>>>>>>>> %s >>>>>>>>>>>>>\n' "$1"
}
banner "Important Parameters:"
banner "Base Model: $BASE_MODEL"
banner "Dataset: $DATASET"
banner "MP Layer Num: $MP"
banner "Compression RATIO: $C_RATIO"
banner "LORA_RANK: $LORA_RANK"
banner "LORA_ALPHA: $LORA_ALPHA"
banner "LORA_DROPOUT: $LORA_DROPOUT"
###### setting for model ######
# Map the short model name to the directory holding its compression-pretrained
# checkpoint (produced by the pretraining stage); unknown names abort.
echo "Finetuning on model: $BASE_MODEL"
case "$BASE_MODEL" in
    qwen_7b)
        BASE_MODEL_DIR="./output/pretrain/${BASE_MODEL}/pt_academic_mp_${MP}_ratio_${C_RATIO}/"
        ;;
    # Add more model mappings here...
    # *) is the default case for handling unknown model names
    *)
        # Diagnostics belong on stderr so they survive stdout redirection.
        echo "Error: Unknown model name '$BASE_MODEL'!" >&2
        echo "Available options: qwen_7b" >&2
        exit 1
        ;;
esac
###### setting for model ######
echo ">>>>>>>>>>>>> Set Base Model: $BASE_MODEL_DIR >>>>>>>>>>>>>"
###### setting for dataset ######
echo "Finetuning on dataset: $DATASET"
OUTPUT_DIR="./output/finetune/${BASE_MODEL}/${DATASET}/"
FINAL_OUTPUT_DIR="./final_output/finetune/${BASE_MODEL}/${DATASET}/model/"
# ###### scan existing ckpt ###### (kept disabled, as in the original)
# if ls -d "${OUTPUT_DIR}/checkpoint"*/ >/dev/null 2>&1; then
# echo "=========================================================="
# echo "Warning: Found existing checkpoint directory in ${OUTPUT_DIR}"
# echo "Script terminated automatically to avoid overwriting existing training progress"
# echo "=========================================================="
# exit 1
# fi
# ###### scan existing ckpt ######
# Map the dataset name to its train/eval JSON files; unknown names abort.
# BUGFIX: the unknown-dataset arm previously printed the error but did NOT
# exit, so the script continued with empty TRAIN_DATA/EVAL_DATA and failed
# later inside torchrun with a confusing message.
case "$DATASET" in
    cora)
        TRAIN_DATA="./data_preprocess/cora/processed/finetuned_${DATASET}_v1.json"
        EVAL_DATA="./data_preprocess/cora/processed/finetuned_${DATASET}_val_v1.json"
        ;;
    *)
        echo "Error: Unknown dataset '$DATASET'!" >&2
        echo "Available options: cora" >&2
        exit 1
        ;;
esac
###### setting for dataset ######
# --- Distributed launch setup ---
# Defaults make the script runnable on a single node out of the box; a cluster
# launcher overrides them through the environment (note NODE_RANK is taken
# from the launcher-provided RANK variable).
WORLD_SIZE=${WORLD_SIZE:-1}
NODE_RANK=${RANK:-0}
MASTER_ADDR=${MASTER_ADDR:-127.0.0.1}
MASTER_PORT=${MASTER_PORT:-12346}

# One worker process per GPU visible on this node.
gpus=$(nvidia-smi -L | wc -l)

# Launch the fine-tuning entry point. All expansions are quoted (SC2086) so
# values containing spaces cannot split the command line; --nnodes is the
# canonical torchrun flag (the original --nnode only worked via argparse
# prefix matching).
torchrun --nproc_per_node="$gpus" --nnodes="$WORLD_SIZE" --node_rank="$NODE_RANK" \
    --master_addr="$MASTER_ADDR" --master_port="$MASTER_PORT" -m main.train \
    --output_dir "$OUTPUT_DIR" \
    --final_output_dir "$FINAL_OUTPUT_DIR" \
    --model_name_or_path "$BASE_MODEL_DIR" \
    --train_data "$TRAIN_DATA" \
    --eval_data "$EVAL_DATA" \
    --load_best_model_at_end True \
    --eval_steps 20 \
    --max_length 8192 \
    --min_length 3072 \
    --max_train_num_per_data 200000 \
    --num_train_epochs "$EPOCH" \
    --enable_beacon True \
    --beacon_param q k v o \
    --training_stage finetune \
    --mp_layer_num "$MP" \
    --c_ratio "$C_RATIO" \
    --lora_rank "$LORA_RANK" \
    --lora_alpha "$LORA_ALPHA" \
    --lora_dropout "$LORA_DROPOUT" \
    --group_by_length \
    --gradient_checkpointing \
    --save_strategy steps \
    --save_steps 20 \
    --save_total_limit 2 \
    --logging_steps 1 \
    --dataset_cache_dir ./cache \
    --model_cache_dir ./cache