#!/bin/bash
# pretrain_multi_node.sh
# Launches the `main.train` module through torchrun for the pretrain stage.
# Positional arguments:
#   $1  BASE_MODEL  model key, e.g. qwen_7b
#   $2  MP          "mp layer" setting
#   $3  C_RATIO     compression ratio; falls back to 0.1 when omitted
#
# NOTE: a bare `export` used to stand here. Without operands it only prints
# every exported variable, which dumps the whole environment (possibly
# including credentials) into the job log, so it has been dropped.
BASE_MODEL=${1:-}
MP=${2:-}
C_RATIO=${3:-0.1}

# Banner so the effective parameters are visible at the top of the log.
echo ">>>>>>>>>>>>> Important Parameters: >>>>>>>>>>>>>"
echo ">>>>>>>>>>>>> Base Model: $BASE_MODEL >>>>>>>>>>>>>"
echo ">>>>>>>>>>>>> MP Layer Num: $MP >>>>>>>>>>>>>"
echo ">>>>>>>>>>>>> Compression RATIO: $C_RATIO >>>>>>>>>>>>>"
###### model selection: begin ######
# Map the model key held in BASE_MODEL to a local checkpoint directory
# (BASE_MODEL_DIR). Any key other than the ones listed aborts with status 1.
printf '%s\n' "Pretrain on model: $BASE_MODEL"
if [[ "$BASE_MODEL" == "qwen_7b" ]]; then
  BASE_MODEL_DIR="./model/Qwen2.5-7B-Instruct/"
else
  printf '%s\n' "❌ Unknown Model Name '$BASE_MODEL'!"
  printf '%s\n' "Available Options: qwen_7b"
  exit 1
fi
###### model selection: end ######
###### dataset / output locations: begin ######
# OUTPUT_DIR is derived from the model key, the MP setting and the compression
# ratio, so runs with different settings write to different directories.
printf '%s\n' "Pretrain on dataset: Academic"
printf -v OUTPUT_DIR './output/pretrain/%s/pt_academic_mp_%s_ratio_%s/' \
  "$BASE_MODEL" "$MP" "$C_RATIO"
printf '%s\n' "Model Output dir: $OUTPUT_DIR"

# The three training files handed to --train_data.
TRAIN_DATA1='./data/pretrained_data/pretrained_Economics.json'
TRAIN_DATA2='./data/pretrained_data/pretrained_Mathematics.json'
TRAIN_DATA3='./data/pretrained_data/pretrained_Geology.json'
###### dataset / output locations: end ######
###### launch ######
EPOCH=1

# Distributed settings are read from the environment when present; the
# fallbacks describe a single-node run on localhost.
WORLD_SIZE=${WORLD_SIZE:-1}             # handed to torchrun as the node count
NODE_RANK=${RANK:-0}                    # this node's rank, taken from RANK
MASTER_ADDR=${MASTER_ADDR:-127.0.0.1}
MASTER_PORT=${MASTER_PORT:-12346}

# Refuse to launch with an empty setting: an empty value would otherwise make
# the following flag get consumed as the value (or silently disappear).
for required_var in OUTPUT_DIR BASE_MODEL_DIR MP C_RATIO; do
  if [[ -z "${!required_var:-}" ]]; then
    echo "Error: ${required_var} is empty; expected arguments: <base_model> <mp_layer_num> <c_ratio>" >&2
    exit 1
  fi
done

# One worker process per GPU. Only lines starting with "GPU " are counted so
# that error text or other non-GPU lines from nvidia-smi are not counted as
# devices.
gpus=$(nvidia-smi -L | grep -c '^GPU ')
if (( gpus < 1 )); then
  echo "Error: no GPU reported by 'nvidia-smi -L'; cannot choose --nproc_per_node" >&2
  exit 1
fi

# Arguments are kept in arrays so every value reaches its program as exactly
# one word, regardless of spaces or glob characters.
launcher_args=(
  --nproc_per_node="$gpus"
  --nnodes="$WORLD_SIZE"
  --node_rank="$NODE_RANK"
  --master_addr="$MASTER_ADDR"
  --master_port="$MASTER_PORT"
)

train_args=(
  --output_dir "$OUTPUT_DIR"
  --model_name_or_path "$BASE_MODEL_DIR"
  --train_data "$TRAIN_DATA1" "$TRAIN_DATA2" "$TRAIN_DATA3"
  --max_length 8192
  --min_length 3072
  --max_train_num_per_data 200000
  --num_train_epochs "$EPOCH"
  --enable_beacon True
  --beacon_param q k v o
  --training_stage pretrain
  --mp_layer_num "$MP"
  --c_ratio "$C_RATIO"
  --gradient_checkpointing
  --save_strategy steps
  --save_steps 50
  --save_total_limit 2
  --logging_steps 1
  --dataset_cache_dir ./cache
  --model_cache_dir ./cache
)

# Last command of the script, so its exit status becomes the script's status.
torchrun "${launcher_args[@]}" -m main.train "${train_args[@]}"