-
Notifications
You must be signed in to change notification settings - Fork 14
Expand file tree
/
Copy pathrun.sh
More file actions
112 lines (95 loc) · 3.13 KB
/
run.sh
File metadata and controls
112 lines (95 loc) · 3.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/bin/bash
#
# Two-stage SeleKT fine-tuning driver.
#   Stage 1: instruction training from MODEL_PATH.
#   Stage 2: conversational training, resumed from the latest
#            Stage-1 checkpoint.
#
# Fill in MODEL_NAME, MODEL_PATH, TRAIN_DATA_STAGE1 and
# TRAIN_DATA_STAGE2 before running.
set -euo pipefail

export MODEL_NAME=""   # short model tag; used in the wandb run names below
export DESC=""         # free-form run description (exported for downstream tools)

# Stage 1: Instruction Training
OUTPUT_DIR_STAGE1="./output/selekt_stage1_instruction"
TRAIN_DATA_STAGE1=""   # path to instruction-tuning dataset
MODEL_PATH=""          # base model checkpoint or HF model id

# Stage 2: Conversational Training
OUTPUT_DIR_STAGE2="./output/selekt_stage2_conversational"
TRAIN_DATA_STAGE2=""   # path to conversational dataset
#######################################
# Print the newest checkpoint-* directory under an output directory.
# Uses version sort (-V) so checkpoint-10 sorts after checkpoint-9.
# Arguments: $1 - output directory to search
# Outputs:   latest checkpoint path on stdout (empty line if none)
# Returns:   0 always; a missing directory yields empty output
#######################################
find_latest_checkpoint() {
  local output_dir=$1
  local latest_checkpoint

  # Guard a missing/empty dir so 'find' does not print errors to stderr.
  if [[ ! -d "$output_dir" ]]; then
    echo ""
    return 0
  fi

  # Declaration split from assignment so a failure of the pipeline
  # is not masked by 'local' (which always succeeds).
  latest_checkpoint=$(find "$output_dir" -name "checkpoint-*" -type d | sort -V | tail -1)
  echo "$latest_checkpoint"
}
echo "Starting Stage 1: SeleKT Instruction Training..."
echo "Model: $MODEL_PATH"
echo "Training data: $TRAIN_DATA_STAGE1"
echo "Output directory: $OUTPUT_DIR_STAGE1"

mkdir -p "$OUTPUT_DIR_STAGE1"

# Stage 1: Instruction Training.
# BUGFIX: the original left a trailing '\' after '--alpha 0.05', which
# line-continued into the following 'if' and produced a bash syntax
# error near 'then'. The command now ends cleanly and its exit status
# is checked directly via 'if !' instead of '$?'.
if ! accelerate launch \
    --config_file=../configs/general_acc.yaml \
    selekt.py \
    --model_name_or_path "$MODEL_PATH" \
    --train_data_path "$TRAIN_DATA_STAGE1" \
    --output_dir "$OUTPUT_DIR_STAGE1" \
    --num_train_epochs 3 \
    --model_max_length 16384 \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 4 \
    --save_strategy "epoch" \
    --save_steps 760 \
    --save_total_limit 25 \
    --learning_rate 1e-5 \
    --warmup_ratio 0.1 \
    --weight_decay 0.1 \
    --logging_steps 5 \
    --lr_scheduler_type "cosine" \
    --report_to "wandb" \
    --gradient_checkpointing True \
    --deepspeed ../configs/ds_config.json \
    --bf16 True \
    --run_name "${MODEL_NAME}_stage1_instruction" \
    --alpha 0.05; then
  echo "Error: Stage 1 training failed!" >&2
  exit 1
fi

echo "Stage 1 completed successfully!"

# Locate the newest Stage-1 checkpoint to seed Stage 2.
LATEST_CHECKPOINT=$(find_latest_checkpoint "$OUTPUT_DIR_STAGE1")
if [ -z "$LATEST_CHECKPOINT" ]; then
  echo "Error: No checkpoint found in $OUTPUT_DIR_STAGE1" >&2
  exit 1
fi
echo "Found latest checkpoint: $LATEST_CHECKPOINT"
echo "Found latest checkpoint: $LATEST_CHECKPOINT"
echo "Starting Stage 2: SeleKT Conversational Training..."
echo "Model: $LATEST_CHECKPOINT"
echo "Training data: $TRAIN_DATA_STAGE2"
echo "Output directory: $OUTPUT_DIR_STAGE2"
mkdir -p $OUTPUT_DIR_STAGE2
# Stage 2: Conversational Training
accelerate launch \
--config_file=../configs/general_acc.yaml \
selekt.py \
--model_name_or_path "${LATEST_CHECKPOINT}" \
--train_data_path "$TRAIN_DATA_STAGE2" \
--output_dir ${OUTPUT_DIR_STAGE2} \
--num_train_epochs 3 \
--model_max_length 16384 \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 4 \
--save_strategy "epoch" \
--save_steps 760 \
--save_total_limit 25 \
--learning_rate 1e-5 \
--warmup_ratio 0.1 \
--weight_decay 0.1 \
--logging_steps 5 \
--lr_scheduler_type "cosine" \
--report_to "wandb" \
--gradient_checkpointing True \
--deepspeed ../configs/ds_config.json \
--bf16 True \
--run_name "${MODEL_NAME}_stage2_conversational" \
--alpha 0.05 \
--is_conversational_training \
# Check if stage 2 completed successfully
if [ $? -ne 0 ]; then
echo "Error: Stage 2 training failed!"
exit 1
fi
echo "Stage 2 training completed!"
echo "Both training stages completed successfully!"
echo "Final model saved in: $OUTPUT_DIR_STAGE2"